Skip to content

Commit

Permalink
Fix cuda nms handling of additional per box features (#7483)
Browse files Browse the repository at this point in the history
  • Loading branch information
Trevor Morris authored Feb 22, 2021
1 parent 072c469 commit d666b41
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 7 deletions.
56 changes: 49 additions & 7 deletions python/tvm/topi/cuda/nms.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,7 @@ def nms_ir(
out_bboxes,
out_scores,
out_class_ids,
out_features,
box_indices,
num_valid_boxes,
max_output_size,
Expand Down Expand Up @@ -390,6 +391,7 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
batch_size = data.shape[0]
num_anchors = data.shape[1]
box_data_length = data.shape[2]
num_features = out_features.shape[2]

ib = tvm.tir.ir_builder.create()

Expand All @@ -402,6 +404,7 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
out_bboxes = ib.buffer_ptr(out_bboxes)
out_scores = ib.buffer_ptr(out_scores)
out_class_ids = ib.buffer_ptr(out_class_ids)
out_features = ib.buffer_ptr(out_features)
box_indices = ib.buffer_ptr(box_indices)
num_valid_boxes = ib.buffer_ptr(num_valid_boxes)

Expand All @@ -428,6 +431,7 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
i = by
base_src_idx = i * num_anchors * box_data_length
base_bbox_idx = i * num_anchors * 4
base_features_idx = i * num_anchors * num_features

with ib.if_scope(tvm.tir.all(iou_threshold > 0, valid_count[i] > 0)):
# Reorder output
Expand All @@ -439,6 +443,10 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
src_idx = base_src_idx + sorted_index[i * num_anchors + j] * box_data_length
with ib.for_range(0, 4, kind="unroll") as k:
out_bboxes[(base_bbox_idx + j * 4 + k)] = data[src_idx + coord_start + k]
with ib.for_range(0, num_features, kind="unroll") as k:
out_features[(base_features_idx + j * num_features + k)] = data[
src_idx + coord_start + 4 + k
]

out_scores[i * num_anchors + j] = data[src_idx + score_index]

Expand All @@ -452,6 +460,8 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
with ib.if_scope(j < num_anchors):
with ib.for_range(0, 4, kind="unroll") as k:
out_bboxes[(base_bbox_idx + j * 4 + k)] = -1.0
with ib.for_range(0, num_features, kind="unroll") as k:
out_features[(base_features_idx + j * num_features + k)] = -1.0

out_scores[i, j] = -1.0

Expand All @@ -468,6 +478,10 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):

with ib.for_range(0, 4, kind="unroll") as k:
out_bboxes[base_bbox_idx + j * 4 + k] = data[src_offset + coord_start + k]
with ib.for_range(0, num_features, kind="unroll") as k:
out_features[(base_features_idx + j * num_features + k)] = data[
src_offset + coord_start + 4 + k
]
out_scores[i * num_anchors + j] = data[src_offset + score_index]

if id_index >= 0:
Expand Down Expand Up @@ -649,16 +663,26 @@ def _run_nms(

batch_size = data.shape[0]
num_anchors = data.shape[1]
# Number of extra features per box beyond coords, score, and id.
num_features = data.shape[2] - 6 if id_index >= 0 else data.shape[2] - 5

# output shapes
bbox_shape = (batch_size, num_anchors, 4)
score_shape = (batch_size, num_anchors)
class_id_shape = score_shape
out_features_shape = (batch_size, num_anchors, num_features)
box_indices_shape = score_shape
num_valid_boxes_shape = (batch_size, 1)

return te.extern(
[bbox_shape, score_shape, class_id_shape, box_indices_shape, num_valid_boxes_shape],
[
bbox_shape,
score_shape,
class_id_shape,
out_features_shape,
box_indices_shape,
num_valid_boxes_shape,
],
[data, sort_tensor, valid_count, indices],
lambda ins, outs: nms_ir(
ins[0],
Expand All @@ -668,8 +692,9 @@ def _run_nms(
outs[0], # sorted bbox
outs[1], # sorted scores
outs[2], # sorted class ids
outs[3], # box_indices
outs[4], # num_valid_boxes
outs[3], # sorted box feats
outs[4], # box_indices
outs[5], # num_valid_boxes
max_output_size,
iou_threshold,
force_suppress,
Expand All @@ -679,19 +704,27 @@ def _run_nms(
score_index,
return_indices,
),
dtype=[data.dtype, "float32", "float32", "int32", "int32"],
dtype=[data.dtype, "float32", "float32", "float32", "int32", "int32"],
in_buffers=[data_buf, sort_tensor_buf, valid_count_buf, indices_buf],
name="nms",
tag="nms",
)


def _concatenate_outputs(
out_bboxes, out_scores, out_class_ids, out_shape, coord_start, score_index, id_index
out_bboxes,
out_scores,
out_class_ids,
out_features,
out_shape,
coord_start,
score_index,
id_index,
):
"""Pack the results from NMS into a single 5D or 6D tensor."""
batch_size = out_bboxes.shape[0]
num_anchors = out_bboxes.shape[1]
num_features = out_features.shape[2]

def ir(out_bboxes, out_scores, out_class_ids, out):
ib = tvm.tir.ir_builder.create()
Expand All @@ -718,6 +751,8 @@ def ir(out_bboxes, out_scores, out_class_ids, out):
with ib.if_scope(tid < num_anchors):
with ib.for_range(0, 4, kind="unroll") as j:
out[i, tid, coord_start + j] = out_bboxes[i, tid, j]
with ib.for_range(0, num_features, kind="unroll") as j:
out[i, tid, coord_start + 4 + j] = out_features[i, tid, j]
out[i, tid, score_index] = out_scores[i, tid]
if id_index >= 0:
out[i, tid, id_index] = out_class_ids[i, tid]
Expand Down Expand Up @@ -829,7 +864,7 @@ def non_max_suppression(

sort_tensor = _get_sorted_indices(data, data_buf, score_index, (data.shape[0], data.shape[1]))

out_bboxes, out_scores, out_class_ids, box_indices, num_valid_boxes = _run_nms(
out_bboxes, out_scores, out_class_ids, out_features, box_indices, num_valid_boxes = _run_nms(
data,
data_buf,
sort_tensor,
Expand All @@ -849,5 +884,12 @@ def non_max_suppression(
return [box_indices, num_valid_boxes]

return _concatenate_outputs(
out_bboxes, out_scores, out_class_ids, data.shape, coord_start, score_index, id_index
out_bboxes,
out_scores,
out_class_ids,
out_features,
data.shape,
coord_start,
score_index,
id_index,
)
36 changes: 36 additions & 0 deletions tests/python/relay/test_op_level5.py
Original file line number Diff line number Diff line change
Expand Up @@ -488,6 +488,42 @@ def verify_nms(
top_k=2,
)

np_data = np.array(
[
[
[0, 0.8, 1, 20, 25, 45, 1, 2, 3, 4],
[1, 0.7, 30, 60, 50, 80, 5, 6, 7, 8],
[0, 0.4, 4, 21, 19, 40, 9, 10, 11, 12],
[2, 0.9, 35, 61, 52, 79, 13, 14, 15, 16],
[1, 0.5, 100, 60, 70, 110, 17, 18, 19, 20],
]
]
).astype("float32")
np_result = np.array(
[
[
[2, 0.9, 35, 61, 52, 79, 13, 14, 15, 16],
[0, 0.8, 1, 20, 25, 45, 1, 2, 3, 4],
[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
]
]
)
dshape = (1, 5, 10)
verify_nms(
np_data,
np_valid_count,
np_indices,
np_max_output_size,
dshape,
np_result,
np_indices_result,
force_suppress=True,
top_k=2,
check_type_only=False,
)


@tvm.testing.uses_gpu
def test_multibox_transform_loc():
Expand Down

0 comments on commit d666b41

Please sign in to comment.