Skip to content

Commit

Permalink
fix boxcoder compute error (PaddlePaddle#8271)
Browse files Browse the repository at this point in the history
  • Loading branch information
chenjiaoAngel authored and WeiLi233 committed Mar 29, 2022
1 parent 372eb78 commit 67e7e7a
Show file tree
Hide file tree
Showing 4 changed files with 135 additions and 235 deletions.
352 changes: 130 additions & 222 deletions lite/backends/arm/math/box_coder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,185 +21,92 @@ namespace lite {
namespace arm {
namespace math {
void encode_bbox_center_kernel(const int batch_num, // N
const float* loc_data,
const float* prior_data,
const float* target_box_data,
const float* prior_box_data,
const float* variance,
const bool var_len4,
const bool normalized,
const int num_priors, // M
float* bbox_data) {
int cnt = num_priors / 4;
//! vprior 0: xmin, 1: ymin, 2: xmax, 3: ymax
//! vloc 0: xmin, 1: ymin, 2: xmax, 3: ymax
//! vvar
float32x4_t vhalf = vdupq_n_f32(0.5f);

int len_batch = num_priors * 4;
float* output) {
int len = 4;
int size = batch_num * num_priors;
float norm_value = (normalized == false) ? 1 : 0;
if (var_len4) {
for (int n = 0; n < batch_num; ++n) {
const float* ptr_loc_batch = loc_data + n * len_batch;
float* ptr_bbox_batch = bbox_data + n * len_batch;

LITE_PARALLEL_BEGIN(i, tid, cnt) {
int idx = i * 16;
const float* ptr_loc = ptr_loc_batch + idx;
const float* ptr_prior = prior_data + idx;
float* ptr_bbox = ptr_bbox_batch + idx;

float32x4x4_t vprior = vld4q_f32(ptr_prior);
float32x4x4_t vloc = vld4q_f32(ptr_loc);
float32x4_t vprior_width = vsubq_f32(vprior.val[2], vprior.val[0]);
float32x4_t vprior_height = vsubq_f32(vprior.val[3], vprior.val[1]);
float32x4_t vprior_cx =
vmulq_f32(vaddq_f32(vprior.val[0], vprior.val[2]), vhalf);
float32x4_t vprior_cy =
vmulq_f32(vaddq_f32(vprior.val[1], vprior.val[3]), vhalf);

float32x4_t vdec_bbx_cx =
vaddq_f32(vmulq_f32(vloc.val[0], vprior_width), vprior_cx);
float32x4_t vdec_bbx_cy =
vaddq_f32(vmulq_f32(vloc.val[1], vprior_height), vprior_cy);
float32x4_t vdec_bbx_w = exp_ps(vloc.val[2]);
float32x4_t vdec_bbx_h = exp_ps(vloc.val[3]);
vprior_width = vmulq_f32(vprior_width, vhalf);
vprior_height = vmulq_f32(vprior_height, vhalf);
vdec_bbx_w = vmulq_f32(vdec_bbx_w, vprior_width);
vdec_bbx_h = vmulq_f32(vdec_bbx_h, vprior_height);

vloc.val[0] = vsubq_f32(vdec_bbx_cx, vdec_bbx_w);
vloc.val[1] = vsubq_f32(vdec_bbx_cy, vdec_bbx_h);
vloc.val[2] = vaddq_f32(vdec_bbx_cx, vdec_bbx_w);
vloc.val[3] = vaddq_f32(vdec_bbx_cy, vdec_bbx_h);

vst4q_f32(ptr_bbox, vloc);
for (int k = 0; k < 4; k++) {
ptr_bbox[k] /= variance[k];
ptr_bbox[k + 4] /= variance[k];
ptr_bbox[k + 8] /= variance[k];
ptr_bbox[k + 12] /= variance[k];
}
}
LITE_PARALLEL_END()

LITE_PARALLEL_COMMON_BEGIN(i, tid, num_priors, cnt * 4, 1) {
int idx = i * 4;
float p_xmin = prior_data[idx];
float p_ymin = prior_data[idx + 1];
float p_xmax = prior_data[idx + 2];
float p_ymax = prior_data[idx + 3];
float prior_width = p_xmax - p_xmin;
float prior_height = p_ymax - p_ymin;
float prior_center_x = (p_xmin + p_xmax) / 2.f;
float prior_center_y = (p_ymin + p_ymax) / 2.f;

float xmin = ptr_loc_batch[idx];
float ymin = ptr_loc_batch[idx + 1];
float xmax = ptr_loc_batch[idx + 2];
float ymax = ptr_loc_batch[idx + 3];

//! variance is encoded in target, we simply need to restore the offset
//! predictions.
float decode_bbox_center_x = xmin * prior_width + prior_center_x;
float decode_bbox_center_y = ymin * prior_height + prior_center_y;
float decode_bbox_width = expf(xmax) * prior_width;
float decode_bbox_height = expf(ymax) * prior_height;

ptr_bbox_batch[idx] = decode_bbox_center_x - decode_bbox_width / 2.f;
ptr_bbox_batch[idx + 1] =
decode_bbox_center_y - decode_bbox_height / 2.f;
ptr_bbox_batch[idx + 2] =
decode_bbox_center_x + decode_bbox_width / 2.f;
ptr_bbox_batch[idx + 3] =
decode_bbox_center_y + decode_bbox_height / 2.f;
ptr_bbox_batch[idx] /= variance[0];
ptr_bbox_batch[idx + 1] /= variance[1];
ptr_bbox_batch[idx + 2] /= variance[2];
ptr_bbox_batch[idx + 3] /= variance[3];
}
LITE_PARALLEL_END()
LITE_PARALLEL_BEGIN(k, tid, size) {
int i = k / num_priors;
int j = k % num_priors;
size_t index_i = i * len;
size_t index_j = j * len;
size_t offset = k * len;
float prior_box_width =
prior_box_data[index_j + 2] - prior_box_data[index_j] + norm_value;
float prior_box_height = prior_box_data[index_j + 3] -
prior_box_data[index_j + 1] + norm_value;
float prior_box_center_x = prior_box_data[index_j] + prior_box_width / 2;
float prior_box_center_y =
prior_box_data[index_j + 1] + prior_box_height / 2;

float target_box_center_x =
(target_box_data[index_i + 2] + target_box_data[index_i]) / 2;
float target_box_center_y =
(target_box_data[index_i + 3] + target_box_data[index_i + 1]) / 2;
float target_box_width =
target_box_data[index_i + 2] - target_box_data[index_i] + norm_value;
float target_box_height = target_box_data[index_i + 3] -
target_box_data[index_i + 1] + norm_value;

output[offset] =
(target_box_center_x - prior_box_center_x) / prior_box_width;
output[offset + 1] =
(target_box_center_y - prior_box_center_y) / prior_box_height;
output[offset + 2] =
std::log(std::fabs(target_box_width / prior_box_width));
output[offset + 3] =
std::log(std::fabs(target_box_height / prior_box_height));
output[offset] /= variance[0];
output[offset + 1] /= variance[1];
output[offset + 2] /= variance[2];
output[offset + 3] /= variance[3];
}
LITE_PARALLEL_END()
} else {
for (int n = 0; n < batch_num; ++n) {
const float* ptr_loc_batch = loc_data + n * len_batch;
float* ptr_bbox_batch = bbox_data + n * len_batch;

LITE_PARALLEL_BEGIN(i, tid, cnt) {
int idx = i * 16;
const float* ptr_loc = ptr_loc_batch + idx;
const float* ptr_prior = prior_data + idx;
float* ptr_bbox = ptr_bbox_batch + idx;

float32x4x4_t vprior = vld4q_f32(ptr_prior);
float32x4x4_t vloc = vld4q_f32(ptr_loc);
float32x4_t vprior_width = vsubq_f32(vprior.val[2], vprior.val[0]);
float32x4_t vprior_height = vsubq_f32(vprior.val[3], vprior.val[1]);
float32x4_t vprior_cx =
vmulq_f32(vaddq_f32(vprior.val[0], vprior.val[2]), vhalf);
float32x4_t vprior_cy =
vmulq_f32(vaddq_f32(vprior.val[1], vprior.val[3]), vhalf);

float32x4_t vdec_bbx_cx =
vaddq_f32(vmulq_f32(vloc.val[0], vprior_width), vprior_cx);
float32x4_t vdec_bbx_cy =
vaddq_f32(vmulq_f32(vloc.val[1], vprior_height), vprior_cy);
float32x4_t vdec_bbx_w = exp_ps(vloc.val[2]);
float32x4_t vdec_bbx_h = exp_ps(vloc.val[3]);
vprior_width = vmulq_f32(vprior_width, vhalf);
vprior_height = vmulq_f32(vprior_height, vhalf);
vdec_bbx_w = vmulq_f32(vdec_bbx_w, vprior_width);
vdec_bbx_h = vmulq_f32(vdec_bbx_h, vprior_height);

vloc.val[0] = vsubq_f32(vdec_bbx_cx, vdec_bbx_w);
vloc.val[1] = vsubq_f32(vdec_bbx_cy, vdec_bbx_h);
vloc.val[2] = vaddq_f32(vdec_bbx_cx, vdec_bbx_w);
vloc.val[3] = vaddq_f32(vdec_bbx_cy, vdec_bbx_h);

vst4q_f32(ptr_bbox, vloc);
for (int k = 0; k < 4; k++) {
ptr_bbox[k] /= variance[idx + k];
ptr_bbox[k + 4] /= variance[idx + k];
ptr_bbox[k + 8] /= variance[idx + k];
ptr_bbox[k + 12] /= variance[idx + k];
}
}
LITE_PARALLEL_END()

LITE_PARALLEL_COMMON_BEGIN(i, tid, num_priors, cnt * 4, 1) {
int idx = i * 4;
float p_xmin = prior_data[idx];
float p_ymin = prior_data[idx + 1];
float p_xmax = prior_data[idx + 2];
float p_ymax = prior_data[idx + 3];
float prior_width = p_xmax - p_xmin;
float prior_height = p_ymax - p_ymin;
float prior_center_x = (p_xmin + p_xmax) / 2.f;
float prior_center_y = (p_ymin + p_ymax) / 2.f;

float xmin = ptr_loc_batch[idx];
float ymin = ptr_loc_batch[idx + 1];
float xmax = ptr_loc_batch[idx + 2];
float ymax = ptr_loc_batch[idx + 3];

//! variance is encoded in target, we simply need to restore the offset
//! predictions.
float decode_bbox_center_x = xmin * prior_width + prior_center_x;
float decode_bbox_center_y = ymin * prior_height + prior_center_y;
float decode_bbox_width = expf(xmax) * prior_width;
float decode_bbox_height = expf(ymax) * prior_height;

ptr_bbox_batch[idx] = decode_bbox_center_x - decode_bbox_width / 2.f;
ptr_bbox_batch[idx + 1] =
decode_bbox_center_y - decode_bbox_height / 2.f;
ptr_bbox_batch[idx + 2] =
decode_bbox_center_x + decode_bbox_width / 2.f;
ptr_bbox_batch[idx + 3] =
decode_bbox_center_y + decode_bbox_height / 2.f;
ptr_bbox_batch[idx] /= variance[idx];
ptr_bbox_batch[idx + 1] /= variance[idx + 1];
ptr_bbox_batch[idx + 2] /= variance[idx + 2];
ptr_bbox_batch[idx + 3] /= variance[idx + 3];
}
LITE_PARALLEL_END()
LITE_PARALLEL_BEGIN(k, tid, size) {
int i = k / num_priors;
int j = k % num_priors;
size_t index_i = i * len;
size_t index_j = j * len;
size_t offset = k * len;
float prior_box_width =
prior_box_data[index_j + 2] - prior_box_data[index_j] + norm_value;
float prior_box_height = prior_box_data[index_j + 3] -
prior_box_data[index_j + 1] + norm_value;
float prior_box_center_x = prior_box_data[index_j] + prior_box_width / 2;
float prior_box_center_y =
prior_box_data[index_j + 1] + prior_box_height / 2;

float target_box_center_x =
(target_box_data[index_i + 2] + target_box_data[index_i]) / 2;
float target_box_center_y =
(target_box_data[index_i + 3] + target_box_data[index_i + 1]) / 2;
float target_box_width =
target_box_data[index_i + 2] - target_box_data[index_i] + norm_value;
float target_box_height = target_box_data[index_i + 3] -
target_box_data[index_i + 1] + norm_value;

output[offset] =
(target_box_center_x - prior_box_center_x) / prior_box_width;
output[offset + 1] =
(target_box_center_y - prior_box_center_y) / prior_box_height;
output[offset + 2] =
std::log(std::fabs(target_box_width / prior_box_width));
output[offset + 3] =
std::log(std::fabs(target_box_height / prior_box_height));
output[offset] /= variance[index_j];
output[offset + 1] /= variance[index_j + 1];
output[offset + 2] /= variance[index_j + 2];
output[offset + 3] /= variance[index_j + 3];
}
LITE_PARALLEL_END()
}
}

Expand All @@ -216,7 +123,7 @@ void decode_bbox_center_kernel(const int batch_num,
//! vloc 0: xmin, 1: ymin, 2: xmax, 3: ymax
//! vvar
float32x4_t vhalf = vdupq_n_f32(0.5f);
float norm_value = (normalized == false);
float norm_value = (normalized == false) ? 1.f : 0.f;
float32x4_t vnormalized = vdupq_n_f32(norm_value);
int len_batch = num_priors * 4;
for (int n = 0; n < batch_num; ++n) {
Expand Down Expand Up @@ -336,57 +243,58 @@ void decode_center_size_axis_1(const int var_size,
const bool normalized,
const std::vector<float> variance,
float* output) {
for (int i = 0; i < row; ++i) {
for (int j = 0; j < col; ++j) {
float var_data[4] = {1., 1., 1., 1.};
float* var_ptr = var_data;
size_t offset = i * col * len + j * len;
int prior_box_offset = i * len; // axis == 0 ? j * len : i * len;

float prior_box_width = prior_box_data[prior_box_offset + 2] -
prior_box_data[prior_box_offset] +
(normalized == false);
float prior_box_height = prior_box_data[prior_box_offset + 3] -
prior_box_data[prior_box_offset + 1] +
(normalized == false);
float prior_box_center_x =
prior_box_data[prior_box_offset] + prior_box_width / 2;
float prior_box_center_y =
prior_box_data[prior_box_offset + 1] + prior_box_height / 2;

float target_box_center_x = 0, target_box_center_y = 0;
float target_box_width = 0, target_box_height = 0;
int prior_var_offset = i * len; // axis == 0 ? j * len : i * len;
if (var_size == 2) {
std::memcpy(
var_ptr, prior_box_var_data + prior_var_offset, 4 * sizeof(float));
} else if (var_size == 1) {
var_ptr = const_cast<float*>(variance.data());
}
float box_var_x = *var_ptr;
float box_var_y = *(var_ptr + 1);
float box_var_w = *(var_ptr + 2);
float box_var_h = *(var_ptr + 3);

target_box_center_x =
box_var_x * target_box_data[offset] * prior_box_width +
prior_box_center_x;
target_box_center_y =
box_var_y * target_box_data[offset + 1] * prior_box_height +
prior_box_center_y;
target_box_width =
std::exp(box_var_w * target_box_data[offset + 2]) * prior_box_width;
target_box_height =
std::exp(box_var_h * target_box_data[offset + 3]) * prior_box_height;

output[offset] = target_box_center_x - target_box_width / 2;
output[offset + 1] = target_box_center_y - target_box_height / 2;
output[offset + 2] =
target_box_center_x + target_box_width / 2 - (normalized == false);
output[offset + 3] =
target_box_center_y + target_box_height / 2 - (normalized == false);
float norm_value = (normalized == false) ? 1.f : 0.f;
int size = row * col;
LITE_PARALLEL_BEGIN(k, tid, size) {
int i = k / col;
int j = k % col;
float var_data[4] = {1., 1., 1., 1.};
float* var_ptr = var_data;
size_t offset = i * col * len + j * len;
int prior_box_offset = i * len; // axis == 0 ? j * len : i * len;

float prior_box_width = prior_box_data[prior_box_offset + 2] -
prior_box_data[prior_box_offset] + norm_value;
float prior_box_height = prior_box_data[prior_box_offset + 3] -
prior_box_data[prior_box_offset + 1] + norm_value;
float prior_box_center_x =
prior_box_data[prior_box_offset] + prior_box_width / 2;
float prior_box_center_y =
prior_box_data[prior_box_offset + 1] + prior_box_height / 2;

float target_box_center_x = 0, target_box_center_y = 0;
float target_box_width = 0, target_box_height = 0;
int prior_var_offset = i * len; // axis == 0 ? j * len : i * len;
if (var_size == 2) {
std::memcpy(
var_ptr, prior_box_var_data + prior_var_offset, 4 * sizeof(float));
} else if (var_size == 1) {
var_ptr = const_cast<float*>(variance.data());
}
float box_var_x = *var_ptr;
float box_var_y = *(var_ptr + 1);
float box_var_w = *(var_ptr + 2);
float box_var_h = *(var_ptr + 3);

target_box_center_x =
box_var_x * target_box_data[offset] * prior_box_width +
prior_box_center_x;
target_box_center_y =
box_var_y * target_box_data[offset + 1] * prior_box_height +
prior_box_center_y;
target_box_width =
std::exp(box_var_w * target_box_data[offset + 2]) * prior_box_width;
target_box_height =
std::exp(box_var_h * target_box_data[offset + 3]) * prior_box_height;

output[offset] = target_box_center_x - target_box_width / 2;
output[offset + 1] = target_box_center_y - target_box_height / 2;
output[offset + 2] =
target_box_center_x + target_box_width / 2 - norm_value;
output[offset + 3] =
target_box_center_y + target_box_height / 2 - norm_value;
}
LITE_PARALLEL_END()
}

} // namespace math
Expand Down
1 change: 1 addition & 0 deletions lite/backends/arm/math/box_coder.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ void encode_bbox_center_kernel(const int batch_num,
const float* prior_data,
const float* variance,
const bool var_len4,
const bool normalized,
const int num_priors,
float* bbox_data);

Expand Down
1 change: 1 addition & 0 deletions lite/kernels/arm/box_coder_compute.cc
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ void BoxCoderCompute::Run() {
prior_box_data,
variance_data,
var_len4,
normalized,
col,
output);
} else if (code_type == "decode_center_size") {
Expand Down
Loading

0 comments on commit 67e7e7a

Please sign in to comment.