Fix cuda nms handling of additional per box features (#7483)

apache · Feb 22, 2021 · d666b41 · d666b41
1 parent 072c469
commit d666b41
Show file tree

Hide file tree

Showing 2 changed files with 85 additions and 7 deletions.
diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py
@@ -272,6 +272,7 @@ def nms_ir(
     out_bboxes,
     out_scores,
     out_class_ids,
+    out_features,
     box_indices,
     num_valid_boxes,
     max_output_size,
@@ -390,6 +391,7 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
     batch_size = data.shape[0]
     num_anchors = data.shape[1]
     box_data_length = data.shape[2]
+    num_features = out_features.shape[2]
 
     ib = tvm.tir.ir_builder.create()
 
@@ -402,6 +404,7 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
     out_bboxes = ib.buffer_ptr(out_bboxes)
     out_scores = ib.buffer_ptr(out_scores)
     out_class_ids = ib.buffer_ptr(out_class_ids)
+    out_features = ib.buffer_ptr(out_features)
     box_indices = ib.buffer_ptr(box_indices)
     num_valid_boxes = ib.buffer_ptr(num_valid_boxes)
 
@@ -428,6 +431,7 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
         i = by
         base_src_idx = i * num_anchors * box_data_length
         base_bbox_idx = i * num_anchors * 4
+        base_features_idx = i * num_anchors * num_features
 
         with ib.if_scope(tvm.tir.all(iou_threshold > 0, valid_count[i] > 0)):
             # Reorder output
@@ -439,6 +443,10 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
                 src_idx = base_src_idx + sorted_index[i * num_anchors + j] * box_data_length
                 with ib.for_range(0, 4, kind="unroll") as k:
                     out_bboxes[(base_bbox_idx + j * 4 + k)] = data[src_idx + coord_start + k]
+                with ib.for_range(0, num_features, kind="unroll") as k:
+                    out_features[(base_features_idx + j * num_features + k)] = data[
+                        src_idx + coord_start + 4 + k
+                    ]
 
                 out_scores[i * num_anchors + j] = data[src_idx + score_index]
 
@@ -452,6 +460,8 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
                     with ib.if_scope(j < num_anchors):
                         with ib.for_range(0, 4, kind="unroll") as k:
                             out_bboxes[(base_bbox_idx + j * 4 + k)] = -1.0
+                        with ib.for_range(0, num_features, kind="unroll") as k:
+                            out_features[(base_features_idx + j * num_features + k)] = -1.0
 
                         out_scores[i, j] = -1.0
 
@@ -468,6 +478,10 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
 
                 with ib.for_range(0, 4, kind="unroll") as k:
                     out_bboxes[base_bbox_idx + j * 4 + k] = data[src_offset + coord_start + k]
+                with ib.for_range(0, num_features, kind="unroll") as k:
+                    out_features[(base_features_idx + j * num_features + k)] = data[
+                        src_offset + coord_start + 4 + k
+                    ]
                 out_scores[i * num_anchors + j] = data[src_offset + score_index]
 
                 if id_index >= 0:
@@ -649,16 +663,26 @@ def _run_nms(
 
     batch_size = data.shape[0]
     num_anchors = data.shape[1]
+    # Number of extra features per box beyond coords, score, and id.
+    num_features = data.shape[2] - 6 if id_index >= 0 else data.shape[2] - 5
 
     # output shapes
     bbox_shape = (batch_size, num_anchors, 4)
     score_shape = (batch_size, num_anchors)
     class_id_shape = score_shape
+    out_features_shape = (batch_size, num_anchors, num_features)
     box_indices_shape = score_shape
     num_valid_boxes_shape = (batch_size, 1)
 
     return te.extern(
-        [bbox_shape, score_shape, class_id_shape, box_indices_shape, num_valid_boxes_shape],
+        [
+            bbox_shape,
+            score_shape,
+            class_id_shape,
+            out_features_shape,
+            box_indices_shape,
+            num_valid_boxes_shape,
+        ],
         [data, sort_tensor, valid_count, indices],
         lambda ins, outs: nms_ir(
             ins[0],
@@ -668,8 +692,9 @@ def _run_nms(
             outs[0],  # sorted bbox
             outs[1],  # sorted scores
             outs[2],  # sorted class ids
-            outs[3],  # box_indices
-            outs[4],  # num_valid_boxes
+            outs[3],  # sorted box feats
+            outs[4],  # box_indices
+            outs[5],  # num_valid_boxes
             max_output_size,
             iou_threshold,
             force_suppress,
@@ -679,19 +704,27 @@ def _run_nms(
             score_index,
             return_indices,
         ),
-        dtype=[data.dtype, "float32", "float32", "int32", "int32"],
+        dtype=[data.dtype, "float32", "float32", "float32", "int32", "int32"],
         in_buffers=[data_buf, sort_tensor_buf, valid_count_buf, indices_buf],
         name="nms",
         tag="nms",
     )
 
 
 def _concatenate_outputs(
-    out_bboxes, out_scores, out_class_ids, out_shape, coord_start, score_index, id_index
+    out_bboxes,
+    out_scores,
+    out_class_ids,
+    out_features,
+    out_shape,
+    coord_start,
+    score_index,
+    id_index,
 ):
     """Pack the results from NMS into a single 5D or 6D tensor."""
     batch_size = out_bboxes.shape[0]
     num_anchors = out_bboxes.shape[1]
+    num_features = out_features.shape[2]
 
     def ir(out_bboxes, out_scores, out_class_ids, out):
         ib = tvm.tir.ir_builder.create()
@@ -718,6 +751,8 @@ def ir(out_bboxes, out_scores, out_class_ids, out):
             with ib.if_scope(tid < num_anchors):
                 with ib.for_range(0, 4, kind="unroll") as j:
                     out[i, tid, coord_start + j] = out_bboxes[i, tid, j]
+                with ib.for_range(0, num_features, kind="unroll") as j:
+                    out[i, tid, coord_start + 4 + j] = out_features[i, tid, j]
                 out[i, tid, score_index] = out_scores[i, tid]
                 if id_index >= 0:
                     out[i, tid, id_index] = out_class_ids[i, tid]
@@ -829,7 +864,7 @@ def non_max_suppression(
 
     sort_tensor = _get_sorted_indices(data, data_buf, score_index, (data.shape[0], data.shape[1]))
 
-    out_bboxes, out_scores, out_class_ids, box_indices, num_valid_boxes = _run_nms(
+    out_bboxes, out_scores, out_class_ids, out_features, box_indices, num_valid_boxes = _run_nms(
         data,
         data_buf,
         sort_tensor,
@@ -849,5 +884,12 @@ def non_max_suppression(
         return [box_indices, num_valid_boxes]
 
     return _concatenate_outputs(
-        out_bboxes, out_scores, out_class_ids, data.shape, coord_start, score_index, id_index
+        out_bboxes,
+        out_scores,
+        out_class_ids,
+        out_features,
+        data.shape,
+        coord_start,
+        score_index,
+        id_index,
     )
diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py
@@ -488,6 +488,42 @@ def verify_nms(
         top_k=2,
     )
 
+    np_data = np.array(
+        [
+            [
+                [0, 0.8, 1, 20, 25, 45, 1, 2, 3, 4],
+                [1, 0.7, 30, 60, 50, 80, 5, 6, 7, 8],
+                [0, 0.4, 4, 21, 19, 40, 9, 10, 11, 12],
+                [2, 0.9, 35, 61, 52, 79, 13, 14, 15, 16],
+                [1, 0.5, 100, 60, 70, 110, 17, 18, 19, 20],
+            ]
+        ]
+    ).astype("float32")
+    np_result = np.array(
+        [
+            [
+                [2, 0.9, 35, 61, 52, 79, 13, 14, 15, 16],
+                [0, 0.8, 1, 20, 25, 45, 1, 2, 3, 4],
+                [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
+                [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
+                [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
+            ]
+        ]
+    )
+    dshape = (1, 5, 10)
+    verify_nms(
+        np_data,
+        np_valid_count,
+        np_indices,
+        np_max_output_size,
+        dshape,
+        np_result,
+        np_indices_result,
+        force_suppress=True,
+        top_k=2,
+        check_type_only=False,
+    )
+
 
 @tvm.testing.uses_gpu
 def test_multibox_transform_loc():