From 67e7e7a39a71d1e2672fd6e42f9104660efa2ac6 Mon Sep 17 00:00:00 2001 From: HappyAngel Date: Wed, 19 Jan 2022 14:45:04 +0800 Subject: [PATCH] fix boxcoder compute error (#8271) --- lite/backends/arm/math/box_coder.cc | 352 +++++++----------- lite/backends/arm/math/box_coder.h | 1 + lite/kernels/arm/box_coder_compute.cc | 1 + .../tests/unittest_py/op/test_box_coder_op.py | 16 +- 4 files changed, 135 insertions(+), 235 deletions(-) diff --git a/lite/backends/arm/math/box_coder.cc b/lite/backends/arm/math/box_coder.cc index 3f5326c3b8b..7f562fcbdd8 100644 --- a/lite/backends/arm/math/box_coder.cc +++ b/lite/backends/arm/math/box_coder.cc @@ -21,185 +21,92 @@ namespace lite { namespace arm { namespace math { void encode_bbox_center_kernel(const int batch_num, // N - const float* loc_data, - const float* prior_data, + const float* target_box_data, + const float* prior_box_data, const float* variance, const bool var_len4, + const bool normalized, const int num_priors, // M - float* bbox_data) { - int cnt = num_priors / 4; - //! vprior 0: xmin, 1: ymin, 2: xmax, 3: ymax - //! vloc 0: xmin, 1: ymin, 2: xmax, 3: ymax - //! vvar - float32x4_t vhalf = vdupq_n_f32(0.5f); - - int len_batch = num_priors * 4; + float* output) { + int len = 4; + int size = batch_num * num_priors; + float norm_value = (normalized == false) ? 1 : 0; if (var_len4) { - for (int n = 0; n < batch_num; ++n) { - const float* ptr_loc_batch = loc_data + n * len_batch; - float* ptr_bbox_batch = bbox_data + n * len_batch; - - LITE_PARALLEL_BEGIN(i, tid, cnt) { - int idx = i * 16; - const float* ptr_loc = ptr_loc_batch + idx; - const float* ptr_prior = prior_data + idx; - float* ptr_bbox = ptr_bbox_batch + idx; - - float32x4x4_t vprior = vld4q_f32(ptr_prior); - float32x4x4_t vloc = vld4q_f32(ptr_loc); - float32x4_t vprior_width = vsubq_f32(vprior.val[2], vprior.val[0]); - float32x4_t vprior_height = vsubq_f32(vprior.val[3], vprior.val[1]); - float32x4_t vprior_cx = - vmulq_f32(vaddq_f32(vprior.val[0], vprior.val[2]), vhalf); - float32x4_t vprior_cy = - vmulq_f32(vaddq_f32(vprior.val[1], vprior.val[3]), vhalf); - - float32x4_t vdec_bbx_cx = - vaddq_f32(vmulq_f32(vloc.val[0], vprior_width), vprior_cx); - float32x4_t vdec_bbx_cy = - vaddq_f32(vmulq_f32(vloc.val[1], vprior_height), vprior_cy); - float32x4_t vdec_bbx_w = exp_ps(vloc.val[2]); - float32x4_t vdec_bbx_h = exp_ps(vloc.val[3]); - vprior_width = vmulq_f32(vprior_width, vhalf); - vprior_height = vmulq_f32(vprior_height, vhalf); - vdec_bbx_w = vmulq_f32(vdec_bbx_w, vprior_width); - vdec_bbx_h = vmulq_f32(vdec_bbx_h, vprior_height); - - vloc.val[0] = vsubq_f32(vdec_bbx_cx, vdec_bbx_w); - vloc.val[1] = vsubq_f32(vdec_bbx_cy, vdec_bbx_h); - vloc.val[2] = vaddq_f32(vdec_bbx_cx, vdec_bbx_w); - vloc.val[3] = vaddq_f32(vdec_bbx_cy, vdec_bbx_h); - - vst4q_f32(ptr_bbox, vloc); - for (int k = 0; k < 4; k++) { - ptr_bbox[k] /= variance[k]; - ptr_bbox[k + 4] /= variance[k]; - ptr_bbox[k + 8] /= variance[k]; - ptr_bbox[k + 12] /= variance[k]; - } - } - LITE_PARALLEL_END() - - LITE_PARALLEL_COMMON_BEGIN(i, tid, num_priors, cnt * 4, 1) { - int idx = i * 4; - float p_xmin = prior_data[idx]; - float p_ymin = prior_data[idx + 1]; - float p_xmax = prior_data[idx + 2]; - float p_ymax = prior_data[idx + 3]; - float prior_width = p_xmax - p_xmin; - float prior_height = p_ymax - p_ymin; - float prior_center_x = (p_xmin + p_xmax) / 2.f; - float prior_center_y = (p_ymin + p_ymax) / 2.f; - - float xmin = ptr_loc_batch[idx]; - float ymin = ptr_loc_batch[idx + 1]; - float xmax = ptr_loc_batch[idx + 2]; - float ymax = ptr_loc_batch[idx + 3]; - - //! variance is encoded in target, we simply need to retore the offset - //! predictions. - float decode_bbox_center_x = xmin * prior_width + prior_center_x; - float decode_bbox_center_y = ymin * prior_height + prior_center_y; - float decode_bbox_width = expf(xmax) * prior_width; - float decode_bbox_height = expf(ymax) * prior_height; - - ptr_bbox_batch[idx] = decode_bbox_center_x - decode_bbox_width / 2.f; - ptr_bbox_batch[idx + 1] = - decode_bbox_center_y - decode_bbox_height / 2.f; - ptr_bbox_batch[idx + 2] = - decode_bbox_center_x + decode_bbox_width / 2.f; - ptr_bbox_batch[idx + 3] = - decode_bbox_center_y + decode_bbox_height / 2.f; - ptr_bbox_batch[idx] /= variance[0]; - ptr_bbox_batch[idx + 1] /= variance[1]; - ptr_bbox_batch[idx + 2] /= variance[2]; - ptr_bbox_batch[idx + 3] /= variance[3]; - } - LITE_PARALLEL_END() + LITE_PARALLEL_BEGIN(k, tid, size) { + int i = k / num_priors; + int j = k % num_priors; + size_t index_i = i * len; + size_t index_j = j * len; + size_t offset = k * len; + float prior_box_width = + prior_box_data[index_j + 2] - prior_box_data[index_j] + norm_value; + float prior_box_height = prior_box_data[index_j + 3] - + prior_box_data[index_j + 1] + norm_value; + float prior_box_center_x = prior_box_data[index_j] + prior_box_width / 2; + float prior_box_center_y = + prior_box_data[index_j + 1] + prior_box_height / 2; + + float target_box_center_x = + (target_box_data[index_i + 2] + target_box_data[index_i]) / 2; + float target_box_center_y = + (target_box_data[index_i + 3] + target_box_data[index_i + 1]) / 2; + float target_box_width = + target_box_data[index_i + 2] - target_box_data[index_i] + norm_value; + float target_box_height = target_box_data[index_i + 3] - + target_box_data[index_i + 1] + norm_value; + + output[offset] = + (target_box_center_x - prior_box_center_x) / prior_box_width; + output[offset + 1] = + (target_box_center_y - prior_box_center_y) / prior_box_height; + output[offset + 2] = + std::log(std::fabs(target_box_width / prior_box_width)); + output[offset + 3] = + std::log(std::fabs(target_box_height / prior_box_height)); + output[offset] /= variance[0]; + output[offset + 1] /= variance[1]; + output[offset + 2] /= variance[2]; + output[offset + 3] /= variance[3]; } + LITE_PARALLEL_END() } else { - for (int n = 0; n < batch_num; ++n) { - const float* ptr_loc_batch = loc_data + n * len_batch; - float* ptr_bbox_batch = bbox_data + n * len_batch; - - LITE_PARALLEL_BEGIN(i, tid, cnt) { - int idx = i * 16; - const float* ptr_loc = ptr_loc_batch + idx; - const float* ptr_prior = prior_data + idx; - float* ptr_bbox = ptr_bbox_batch + idx; - - float32x4x4_t vprior = vld4q_f32(ptr_prior); - float32x4x4_t vloc = vld4q_f32(ptr_loc); - float32x4_t vprior_width = vsubq_f32(vprior.val[2], vprior.val[0]); - float32x4_t vprior_height = vsubq_f32(vprior.val[3], vprior.val[1]); - float32x4_t vprior_cx = - vmulq_f32(vaddq_f32(vprior.val[0], vprior.val[2]), vhalf); - float32x4_t vprior_cy = - vmulq_f32(vaddq_f32(vprior.val[1], vprior.val[3]), vhalf); - - float32x4_t vdec_bbx_cx = - vaddq_f32(vmulq_f32(vloc.val[0], vprior_width), vprior_cx); - float32x4_t vdec_bbx_cy = - vaddq_f32(vmulq_f32(vloc.val[1], vprior_height), vprior_cy); - float32x4_t vdec_bbx_w = exp_ps(vloc.val[2]); - float32x4_t vdec_bbx_h = exp_ps(vloc.val[3]); - vprior_width = vmulq_f32(vprior_width, vhalf); - vprior_height = vmulq_f32(vprior_height, vhalf); - vdec_bbx_w = vmulq_f32(vdec_bbx_w, vprior_width); - vdec_bbx_h = vmulq_f32(vdec_bbx_h, vprior_height); - - vloc.val[0] = vsubq_f32(vdec_bbx_cx, vdec_bbx_w); - vloc.val[1] = vsubq_f32(vdec_bbx_cy, vdec_bbx_h); - vloc.val[2] = vaddq_f32(vdec_bbx_cx, vdec_bbx_w); - vloc.val[3] = vaddq_f32(vdec_bbx_cy, vdec_bbx_h); - - vst4q_f32(ptr_bbox, vloc); - for (int k = 0; k < 4; k++) { - ptr_bbox[k] /= variance[idx + k]; - ptr_bbox[k + 4] /= variance[idx + k]; - ptr_bbox[k + 8] /= variance[idx + k]; - ptr_bbox[k + 12] /= variance[idx + k]; - } - } - LITE_PARALLEL_END() - - LITE_PARALLEL_COMMON_BEGIN(i, tid, num_priors, cnt * 4, 1) { - int idx = i * 4; - float p_xmin = prior_data[idx]; - float p_ymin = prior_data[idx + 1]; - float p_xmax = prior_data[idx + 2]; - float p_ymax = prior_data[idx + 3]; - float prior_width = p_xmax - p_xmin; - float prior_height = p_ymax - p_ymin; - float prior_center_x = (p_xmin + p_xmax) / 2.f; - float prior_center_y = (p_ymin + p_ymax) / 2.f; - - float xmin = ptr_loc_batch[idx]; - float ymin = ptr_loc_batch[idx + 1]; - float xmax = ptr_loc_batch[idx + 2]; - float ymax = ptr_loc_batch[idx + 3]; - - //! variance is encoded in target, we simply need to retore the offset - //! predictions. - float decode_bbox_center_x = xmin * prior_width + prior_center_x; - float decode_bbox_center_y = ymin * prior_height + prior_center_y; - float decode_bbox_width = expf(xmax) * prior_width; - float decode_bbox_height = expf(ymax) * prior_height; - - ptr_bbox_batch[idx] = decode_bbox_center_x - decode_bbox_width / 2.f; - ptr_bbox_batch[idx + 1] = - decode_bbox_center_y - decode_bbox_height / 2.f; - ptr_bbox_batch[idx + 2] = - decode_bbox_center_x + decode_bbox_width / 2.f; - ptr_bbox_batch[idx + 3] = - decode_bbox_center_y + decode_bbox_height / 2.f; - ptr_bbox_batch[idx] /= variance[idx]; - ptr_bbox_batch[idx + 1] /= variance[idx + 1]; - ptr_bbox_batch[idx + 2] /= variance[idx + 2]; - ptr_bbox_batch[idx + 3] /= variance[idx + 3]; - } - LITE_PARALLEL_END() + LITE_PARALLEL_BEGIN(k, tid, size) { + int i = k / num_priors; + int j = k % num_priors; + size_t index_i = i * len; + size_t index_j = j * len; + size_t offset = k * len; + float prior_box_width = + prior_box_data[index_j + 2] - prior_box_data[index_j] + norm_value; + float prior_box_height = prior_box_data[index_j + 3] - + prior_box_data[index_j + 1] + norm_value; + float prior_box_center_x = prior_box_data[index_j] + prior_box_width / 2; + float prior_box_center_y = + prior_box_data[index_j + 1] + prior_box_height / 2; + + float target_box_center_x = + (target_box_data[index_i + 2] + target_box_data[index_i]) / 2; + float target_box_center_y = + (target_box_data[index_i + 3] + target_box_data[index_i + 1]) / 2; + float target_box_width = + target_box_data[index_i + 2] - target_box_data[index_i] + norm_value; + float target_box_height = target_box_data[index_i + 3] - + target_box_data[index_i + 1] + norm_value; + + output[offset] = + (target_box_center_x - prior_box_center_x) / prior_box_width; + output[offset + 1] = + (target_box_center_y - prior_box_center_y) / prior_box_height; + output[offset + 2] = + std::log(std::fabs(target_box_width / prior_box_width)); + output[offset + 3] = + std::log(std::fabs(target_box_height / prior_box_height)); + output[offset] /= variance[index_j]; + output[offset + 1] /= variance[index_j + 1]; + output[offset + 2] /= variance[index_j + 2]; + output[offset + 3] /= variance[index_j + 3]; } + LITE_PARALLEL_END() } } @@ -216,7 +123,7 @@ void decode_bbox_center_kernel(const int batch_num, //! vloc 0: xmin, 1: ymin, 2: xmax, 3: ymax //! vvar float32x4_t vhalf = vdupq_n_f32(0.5f); - float norm_value = (normalized == false); + float norm_value = (normalized == false) ? 1.f : 0.f; float32x4_t vnormalized = vdupq_n_f32(norm_value); int len_batch = num_priors * 4; for (int n = 0; n < batch_num; ++n) { @@ -336,57 +243,58 @@ void decode_center_size_axis_1(const int var_size, const bool normalized, const std::vector variance, float* output) { - for (int i = 0; i < row; ++i) { - for (int j = 0; j < col; ++j) { - float var_data[4] = {1., 1., 1., 1.}; - float* var_ptr = var_data; - size_t offset = i * col * len + j * len; - int prior_box_offset = i * len; // axis == 0 ? j * len : i * len; - - float prior_box_width = prior_box_data[prior_box_offset + 2] - - prior_box_data[prior_box_offset] + - (normalized == false); - float prior_box_height = prior_box_data[prior_box_offset + 3] - - prior_box_data[prior_box_offset + 1] + - (normalized == false); - float prior_box_center_x = - prior_box_data[prior_box_offset] + prior_box_width / 2; - float prior_box_center_y = - prior_box_data[prior_box_offset + 1] + prior_box_height / 2; - - float target_box_center_x = 0, target_box_center_y = 0; - float target_box_width = 0, target_box_height = 0; - int prior_var_offset = i * len; // axis == 0 ? j * len : i * len; - if (var_size == 2) { - std::memcpy( - var_ptr, prior_box_var_data + prior_var_offset, 4 * sizeof(float)); - } else if (var_size == 1) { - var_ptr = const_cast(variance.data()); - } - float box_var_x = *var_ptr; - float box_var_y = *(var_ptr + 1); - float box_var_w = *(var_ptr + 2); - float box_var_h = *(var_ptr + 3); - - target_box_center_x = - box_var_x * target_box_data[offset] * prior_box_width + - prior_box_center_x; - target_box_center_y = - box_var_y * target_box_data[offset + 1] * prior_box_height + - prior_box_center_y; - target_box_width = - std::exp(box_var_w * target_box_data[offset + 2]) * prior_box_width; - target_box_height = - std::exp(box_var_h * target_box_data[offset + 3]) * prior_box_height; - - output[offset] = target_box_center_x - target_box_width / 2; - output[offset + 1] = target_box_center_y - target_box_height / 2; - output[offset + 2] = - target_box_center_x + target_box_width / 2 - (normalized == false); - output[offset + 3] = - target_box_center_y + target_box_height / 2 - (normalized == false); + float norm_value = (normalized == false) ? 1.f : 0.f; + int size = row * col; + LITE_PARALLEL_BEGIN(k, tid, size) { + int i = k / col; + int j = k % col; + float var_data[4] = {1., 1., 1., 1.}; + float* var_ptr = var_data; + size_t offset = i * col * len + j * len; + int prior_box_offset = i * len; // axis == 0 ? j * len : i * len; + + float prior_box_width = prior_box_data[prior_box_offset + 2] - + prior_box_data[prior_box_offset] + norm_value; + float prior_box_height = prior_box_data[prior_box_offset + 3] - + prior_box_data[prior_box_offset + 1] + norm_value; + float prior_box_center_x = + prior_box_data[prior_box_offset] + prior_box_width / 2; + float prior_box_center_y = + prior_box_data[prior_box_offset + 1] + prior_box_height / 2; + + float target_box_center_x = 0, target_box_center_y = 0; + float target_box_width = 0, target_box_height = 0; + int prior_var_offset = i * len; // axis == 0 ? j * len : i * len; + if (var_size == 2) { + std::memcpy( + var_ptr, prior_box_var_data + prior_var_offset, 4 * sizeof(float)); + } else if (var_size == 1) { + var_ptr = const_cast(variance.data()); } + float box_var_x = *var_ptr; + float box_var_y = *(var_ptr + 1); + float box_var_w = *(var_ptr + 2); + float box_var_h = *(var_ptr + 3); + + target_box_center_x = + box_var_x * target_box_data[offset] * prior_box_width + + prior_box_center_x; + target_box_center_y = + box_var_y * target_box_data[offset + 1] * prior_box_height + + prior_box_center_y; + target_box_width = + std::exp(box_var_w * target_box_data[offset + 2]) * prior_box_width; + target_box_height = + std::exp(box_var_h * target_box_data[offset + 3]) * prior_box_height; + + output[offset] = target_box_center_x - target_box_width / 2; + output[offset + 1] = target_box_center_y - target_box_height / 2; + output[offset + 2] = + target_box_center_x + target_box_width / 2 - norm_value; + output[offset + 3] = + target_box_center_y + target_box_height / 2 - norm_value; } + LITE_PARALLEL_END() } } // namespace math diff --git a/lite/backends/arm/math/box_coder.h b/lite/backends/arm/math/box_coder.h index 429b9e305a9..45851c11cac 100644 --- a/lite/backends/arm/math/box_coder.h +++ b/lite/backends/arm/math/box_coder.h @@ -28,6 +28,7 @@ void encode_bbox_center_kernel(const int batch_num, const float* prior_data, const float* variance, const bool var_len4, + const bool normalized, const int num_priors, float* bbox_data); diff --git a/lite/kernels/arm/box_coder_compute.cc b/lite/kernels/arm/box_coder_compute.cc index f2efa2aa09a..003cbc8440a 100644 --- a/lite/kernels/arm/box_coder_compute.cc +++ b/lite/kernels/arm/box_coder_compute.cc @@ -69,6 +69,7 @@ void BoxCoderCompute::Run() { prior_box_data, variance_data, var_len4, + normalized, col, output); } else if (code_type == "decode_center_size") { diff --git a/lite/tests/unittest_py/op/test_box_coder_op.py b/lite/tests/unittest_py/op/test_box_coder_op.py index 438e864b5ff..ec0f3c6f1c5 100644 --- a/lite/tests/unittest_py/op/test_box_coder_op.py +++ b/lite/tests/unittest_py/op/test_box_coder_op.py @@ -31,7 +31,6 @@ class TestBoxCoderOp(AutoScanTest): def __init__(self, *args, **kwargs): AutoScanTest.__init__(self, *args, **kwargs) - # precision has diff on arm self.enable_testing_on_place( TargetType.ARM, PrecisionType.FP32, @@ -134,7 +133,9 @@ def generate_targetbox(*args, **kwargs): return program_config def sample_predictor_configs(self): - return self.get_predictor_configs(), ["box_coder"], (1e-5, 1e-5) + # code_type = "encode_center_size", abs_error = 1e-4. out = out /variance + # code_type = "decode_center_size", abs_error=1e-5. + return self.get_predictor_configs(), ["box_coder"], (1e-4, 1e-4) def add_ignore_pass_case(self): def teller1(program_config, predictor_config): @@ -146,21 +147,10 @@ def teller1(program_config, predictor_config): 0].inputs: return True - def teller2(program_config, predictor_config): - if predictor_config.target() == TargetType.ARM: - if program_config.ops[0].attrs[ - "code_type"] == "encode_center_size": - return True - return False - self.add_ignore_check_case( teller1, IgnoreReasons.PADDLELITE_NOT_SUPPORT, "Lite is not supported on opencl. We need to fix it as soon as possible." ) - self.add_ignore_check_case( - teller2, IgnoreReasons.ACCURACY_ERROR, - "The op output has diff in a specific case on arm. We need to fix it as soon as possible." - ) def test(self, *args, **kwargs): target_str = self.get_target()