From 93722bcb0cc4e43af1fe955aeabf18e7505719bf Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Mon, 22 Feb 2021 13:55:59 +0000
Subject: [PATCH 1/3] Specify coordinate constraints

---
 torchvision/ops/boxes.py | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/torchvision/ops/boxes.py b/torchvision/ops/boxes.py
index 2cb1be93168..c0059062aaa 100644
--- a/torchvision/ops/boxes.py
+++ b/torchvision/ops/boxes.py
@@ -22,7 +22,8 @@ def nms(boxes: Tensor, scores: Tensor, iou_threshold: float) -> Tensor:

     Args:
         boxes (Tensor[N, 4]): boxes to perform NMS on. They
-            are expected to be in (x1, y1, x2, y2) format
+            are expected to be in ``(x1, y1, x2, y2)`` format with ``0 <= x1 < x2`` and
+            ``0 <= y1 < y2``.
         scores (Tensor[N]): scores for each one of the boxes
         iou_threshold (float): discards all overlapping boxes with IoU > iou_threshold
@@ -50,7 +51,8 @@ def batched_nms(

     Args:
         boxes (Tensor[N, 4]): boxes where NMS will be performed. They
-            are expected to be in (x1, y1, x2, y2) format
+            are expected to be in ``(x1, y1, x2, y2)`` format with ``0 <= x1 < x2`` and
+            ``0 <= y1 < y2``.
         scores (Tensor[N]): scores for each one of the boxes
         idxs (Tensor[N]): indices of the categories for each one of the boxes.
         iou_threshold (float): discards all overlapping boxes with IoU > iou_threshold
@@ -79,7 +81,8 @@ def remove_small_boxes(boxes: Tensor, min_size: float) -> Tensor:
     Remove boxes which contain at least one side smaller than min_size.

     Args:
-        boxes (Tensor[N, 4]): boxes in (x1, y1, x2, y2) format
+        boxes (Tensor[N, 4]): boxes in ``(x1, y1, x2, y2)`` format
+            with ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
         min_size (float): minimum size

     Returns:
@@ -97,7 +100,8 @@ def clip_boxes_to_image(boxes: Tensor, size: Tuple[int, int]) -> Tensor:
     Clip boxes so that they lie inside an image of size `size`.

     Args:
-        boxes (Tensor[N, 4]): boxes in (x1, y1, x2, y2) format
+        boxes (Tensor[N, 4]): boxes in ``(x1, y1, x2, y2)`` format
+            with ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
         size (Tuple[height, width]): size of the image

     Returns:
@@ -185,7 +189,8 @@ def box_area(boxes: Tensor) -> Tensor:

     Args:
         boxes (Tensor[N, 4]): boxes for which the area will be computed. They
-            are expected to be in (x1, y1, x2, y2) format
+            are expected to be in (x1, y1, x2, y2) format with
+            ``0 <= x1 < x2`` and ``0 <= y1 < y2``.

     Returns:
         area (Tensor[N]): area for each box
@@ -215,7 +220,8 @@ def box_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor:
     """
     Return intersection-over-union (Jaccard index) of boxes.

-    Both sets of boxes are expected to be in (x1, y1, x2, y2) format.
+    Both sets of boxes are expected to be in ``(x1, y1, x2, y2)`` format with
+    ``0 <= x1 < x2`` and ``0 <= y1 < y2``.

     Args:
         boxes1 (Tensor[N, 4])
@@ -234,7 +240,8 @@ def generalized_box_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor:
     """
     Return generalized intersection-over-union (Jaccard index) of boxes.

-    Both sets of boxes are expected to be in (x1, y1, x2, y2) format.
+    Both sets of boxes are expected to be in ``(x1, y1, x2, y2)`` format with
+    ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
     Args:
         boxes1 (Tensor[N, 4])

From f0e9397edcc3768b3cd7804310133b8cf34a08c8 Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Mon, 22 Feb 2021 14:10:24 +0000
Subject: [PATCH 2/3] some more

---
 torchvision/models/detection/faster_rcnn.py   | 16 ++++++++--------
 torchvision/models/detection/keypoint_rcnn.py | 16 ++++++++--------
 torchvision/models/detection/mask_rcnn.py     | 16 ++++++++--------
 torchvision/models/detection/retinanet.py     | 16 ++++++++--------
 torchvision/ops/poolers.py                    |  2 +-
 torchvision/ops/ps_roi_align.py               |  4 +++-
 torchvision/ops/ps_roi_pool.py                |  4 +++-
 torchvision/ops/roi_align.py                  |  4 +++-
 torchvision/ops/roi_pool.py                   |  4 +++-
 9 files changed, 45 insertions(+), 37 deletions(-)

diff --git a/torchvision/models/detection/faster_rcnn.py b/torchvision/models/detection/faster_rcnn.py
index 0599d1da484..6781c965d18 100644
--- a/torchvision/models/detection/faster_rcnn.py
+++ b/torchvision/models/detection/faster_rcnn.py
@@ -32,8 +32,8 @@ class FasterRCNN(GeneralizedRCNN):

     During training, the model expects both the input tensors, as well as targets (list of dictionaries),
     containing:
-        - boxes (FloatTensor[N, 4]): the ground-truth boxes in [x1, y1, x2, y2] format, with values of x
-          between 0 and W and values of y between 0 and H
+        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
         - labels (Int64Tensor[N]): the class label for each ground-truth box

     The model returns a Dict[Tensor] during training, containing the classification and regression
@@ -42,8 +42,8 @@ class FasterRCNN(GeneralizedRCNN):
     During inference, the model requires only the input tensors, and returns the post-processed
     predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
     follows:
-        - boxes (FloatTensor[N, 4]): the predicted boxes in [x1, y1, x2, y2] format, with values of x
-          between 0 and W and values of y between 0 and H
+        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
         - labels (Int64Tensor[N]): the predicted labels for each image
         - scores (Tensor[N]): the scores of each prediction
@@ -309,8 +309,8 @@ def fasterrcnn_resnet50_fpn(pretrained=False, progress=True,

     During training, the model expects both the input tensors, as well as targets (list of dictionaries),
     containing:
-        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with values of ``x``
-          between ``0`` and ``W`` and values of ``y`` between ``0`` and ``H``
+        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
         - labels (``Int64Tensor[N]``): the class label for each ground-truth box

     The model returns a ``Dict[Tensor]`` during training, containing the classification and regression
@@ -320,8 +320,8 @@ def fasterrcnn_resnet50_fpn(pretrained=False, progress=True,
     predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
     follows:
-        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with values of ``x``
-          between ``0`` and ``W`` and values of ``y`` between ``0`` and ``H``
+        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
         - labels (``Int64Tensor[N]``): the predicted labels for each image
         - scores (``Tensor[N]``): the scores of each prediction

diff --git a/torchvision/models/detection/keypoint_rcnn.py b/torchvision/models/detection/keypoint_rcnn.py
index f784273f5c2..0d460ade27c 100644
--- a/torchvision/models/detection/keypoint_rcnn.py
+++ b/torchvision/models/detection/keypoint_rcnn.py
@@ -27,8 +27,8 @@ class KeypointRCNN(FasterRCNN):

     During training, the model expects both the input tensors, as well as targets (list of dictionaries),
     containing:
-        - boxes (FloatTensor[N, 4]): the ground-truth boxes in [x1, y1, x2, y2] format, with values of x
-          between 0 and W and values of y between 0 and H
+        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
         - labels (Int64Tensor[N]): the class label for each ground-truth box
         - keypoints (FloatTensor[N, K, 3]): the K keypoint locations for each of the N instances, in the
          format [x, y, visibility], where visibility=0 means that the keypoint is not visible.
@@ -40,8 +40,8 @@ class KeypointRCNN(FasterRCNN):
     predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
     follows:
-        - boxes (FloatTensor[N, 4]): the predicted boxes in [x1, y1, x2, y2] format, with values of x
-          between 0 and W and values of y between 0 and H
+        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
         - labels (Int64Tensor[N]): the predicted labels for each image
         - scores (Tensor[N]): the scores of each prediction
         - keypoints (FloatTensor[N, K, 3]): the locations of the predicted keypoints, in [x, y, v] format.
@@ -286,8 +286,8 @@ def keypointrcnn_resnet50_fpn(pretrained=False, progress=True,

     During training, the model expects both the input tensors, as well as targets (list of dictionaries),
     containing:
-        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with values of ``x``
-          between ``0`` and ``W`` and values of ``y`` between ``0`` and ``H``
+        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
         - labels (``Int64Tensor[N]``): the class label for each ground-truth box
         - keypoints (``FloatTensor[N, K, 3]``): the ``K`` keypoint locations for each of the ``N`` instances, in the
          format ``[x, y, visibility]``, where ``visibility=0`` means that the keypoint is not visible.
@@ -299,8 +299,8 @@ def keypointrcnn_resnet50_fpn(pretrained=False, progress=True,
     predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
     follows:
-        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with values of ``x``
-          between ``0`` and ``W`` and values of ``y`` between ``0`` and ``H``
+        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
         - labels (``Int64Tensor[N]``): the predicted labels for each image
         - scores (``Tensor[N]``): the scores of each prediction
         - keypoints (``FloatTensor[N, K, 3]``): the locations of the predicted keypoints, in ``[x, y, v]`` format.
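For readers checking the new constraint against the detection entry points touched above, a minimal, hypothetical training-mode sketch (the model builder itself appears in the hunks above; the image size, box values, and class count are illustrative only)::

    import torch
    import torchvision

    # Boxes must satisfy 0 <= x1 < x2 <= W and 0 <= y1 < y2 <= H.
    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(
        pretrained=False, num_classes=2)
    model.train()

    images = [torch.rand(3, 300, 400)]  # one image, H=300, W=400
    targets = [{
        # 0 <= 40 < 120 <= W=400 and 0 <= 50 < 200 <= H=300: a valid box
        'boxes': torch.tensor([[40.0, 50.0, 120.0, 200.0]]),
        'labels': torch.tensor([1], dtype=torch.int64),
    }]

    loss_dict = model(images, targets)  # dict of classification/regression losses
    print(sorted(loss_dict.keys()))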
diff --git a/torchvision/models/detection/mask_rcnn.py b/torchvision/models/detection/mask_rcnn.py
index 09be4fa684c..1e6fb77f07a 100644
--- a/torchvision/models/detection/mask_rcnn.py
+++ b/torchvision/models/detection/mask_rcnn.py
@@ -26,8 +26,8 @@ class MaskRCNN(FasterRCNN):

     During training, the model expects both the input tensors, as well as targets (list of dictionaries),
     containing:
-        - boxes (FloatTensor[N, 4]): the ground-truth boxes in [x1, y1, x2, y2] format, with values of x
-          between 0 and W and values of y between 0 and H
+        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
         - labels (Int64Tensor[N]): the class label for each ground-truth box
         - masks (UInt8Tensor[N, H, W]): the segmentation binary masks for each instance
@@ -37,8 +37,8 @@ class MaskRCNN(FasterRCNN):
     During inference, the model requires only the input tensors, and returns the post-processed
     predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
     follows:
-        - boxes (FloatTensor[N, 4]): the predicted boxes in [x1, y1, x2, y2] format, with values of x
-          between 0 and W and values of y between 0 and H
+        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
         - labels (Int64Tensor[N]): the predicted labels for each image
         - scores (Tensor[N]): the scores of each prediction
         - masks (UInt8Tensor[N, 1, H, W]): the predicted masks for each instance, in 0-1 range. In order to
@@ -279,8 +279,8 @@ def maskrcnn_resnet50_fpn(pretrained=False, progress=True,

     During training, the model expects both the input tensors, as well as targets (list of dictionaries),
     containing:
-        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with values of ``x``
-          between ``0`` and ``W`` and values of ``y`` between ``0`` and ``H``
+        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
         - labels (``Int64Tensor[N]``): the class label for each ground-truth box
         - masks (``UInt8Tensor[N, H, W]``): the segmentation binary masks for each instance
@@ -291,8 +291,8 @@ def maskrcnn_resnet50_fpn(pretrained=False, progress=True,
     predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
     follows:
-        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with values of ``x``
-          between ``0`` and ``W`` and values of ``y`` between ``0`` and ``H``
+        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
         - labels (``Int64Tensor[N]``): the predicted labels for each image
         - scores (``Tensor[N]``): the scores of each prediction
         - masks (``UInt8Tensor[N, 1, H, W]``): the predicted masks for each instance, in ``0-1`` range. In order to
diff --git a/torchvision/models/detection/retinanet.py b/torchvision/models/detection/retinanet.py
index 5c2850e8834..f34db4ce970 100644
--- a/torchvision/models/detection/retinanet.py
+++ b/torchvision/models/detection/retinanet.py
@@ -236,8 +236,8 @@ class RetinaNet(nn.Module):

     During training, the model expects both the input tensors, as well as targets (list of dictionaries),
     containing:
-        - boxes (FloatTensor[N, 4]): the ground-truth boxes in [x1, y1, x2, y2] format, with values
-          between 0 and H and 0 and W
+        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
         - labels (Int64Tensor[N]): the class label for each ground-truth box

     The model returns a Dict[Tensor] during training, containing the classification and regression
@@ -246,8 +246,8 @@ class RetinaNet(nn.Module):
     During inference, the model requires only the input tensors, and returns the post-processed
     predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
     follows:
-        - boxes (FloatTensor[N, 4]): the predicted boxes in [x1, y1, x2, y2] format, with values between
-          0 and H and 0 and W
+        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
         - labels (Int64Tensor[N]): the predicted labels for each image
         - scores (Tensor[N]): the scores for each prediction
@@ -576,8 +576,8 @@ def retinanet_resnet50_fpn(pretrained=False, progress=True,

     During training, the model expects both the input tensors, as well as targets (list of dictionaries),
     containing:
-        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with values
-          between ``0`` and ``H`` and ``0`` and ``W``
+        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
         - labels (``Int64Tensor[N]``): the class label for each ground-truth box

     The model returns a ``Dict[Tensor]`` during training, containing the classification and regression
@@ -587,8 +587,8 @@ def retinanet_resnet50_fpn(pretrained=False, progress=True,
     predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
     follows:
-        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with values between
-          ``0`` and ``H`` and ``0`` and ``W``
+        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
         - labels (``Int64Tensor[N]``): the predicted labels for each image
         - scores (``Tensor[N]``): the scores of each prediction

diff --git a/torchvision/ops/poolers.py b/torchvision/ops/poolers.py
index 25888afea76..a0ba5b42774 100644
--- a/torchvision/ops/poolers.py
+++ b/torchvision/ops/poolers.py
@@ -204,7 +204,7 @@ def forward(
             all the same number of channels, but they can have different sizes.
         boxes (List[Tensor[N, 4]]): boxes to be used to perform the pooling operation, in
             (x1, y1, x2, y2) format and in the image reference size, not the feature map
-            reference.
+            reference. The coordinates must satisfy ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
         image_shapes (List[Tuple[height, width]]): the sizes of each image before they
             have been fed to a CNN to obtain feature maps. This allows us to infer the
             scale factor for each one of the levels to be pooled.
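The pooler documented in the hunk just above follows the same convention. A small sketch of ``MultiScaleRoIAlign`` with boxes in image coordinates (the feature-map name, channel count, and sizes are made up for illustration)::

    import torch
    from torchvision.ops import MultiScaleRoIAlign

    # Feature maps for a batch of 2 images; the name '0' is illustrative.
    features = {'0': torch.rand(2, 256, 50, 50)}
    pooler = MultiScaleRoIAlign(featmap_names=['0'], output_size=7,
                                sampling_ratio=2)

    # One Tensor[N, 4] per image, in (x1, y1, x2, y2) image coordinates,
    # each box satisfying 0 <= x1 < x2 and 0 <= y1 < y2.
    boxes = [
        torch.tensor([[10.0, 20.0, 60.0, 90.0]]),  # image 0
        torch.tensor([[0.0, 0.0, 30.0, 40.0]]),    # image 1
    ]
    image_shapes = [(200, 200), (200, 200)]  # (height, width) before the CNN

    pooled = pooler(features, boxes, image_shapes)
    print(pooled.shape)  # torch.Size([2, 256, 7, 7]), one output per box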
diff --git a/torchvision/ops/ps_roi_align.py b/torchvision/ops/ps_roi_align.py
index 46bcdbe4d91..d14f429785a 100644
--- a/torchvision/ops/ps_roi_align.py
+++ b/torchvision/ops/ps_roi_align.py
@@ -21,7 +21,9 @@ def ps_roi_align(
     Args:
         input (Tensor[N, C, H, W]): input tensor
         boxes (Tensor[K, 5] or List[Tensor[L, 4]]): the box coordinates in (x1, y1, x2, y2)
-            format where the regions will be taken from. If a single Tensor is passed,
+            format where the regions will be taken from.
+            The coordinates must satisfy ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
+            If a single Tensor is passed,
             then the first column should contain the batch index. If a list of Tensors
             is passed, then each Tensor will correspond to the boxes for an element i
             in a batch
diff --git a/torchvision/ops/ps_roi_pool.py b/torchvision/ops/ps_roi_pool.py
index f434fbd0b9f..8c07eb864a8 100644
--- a/torchvision/ops/ps_roi_pool.py
+++ b/torchvision/ops/ps_roi_pool.py
@@ -20,7 +20,9 @@ def ps_roi_pool(
     Args:
         input (Tensor[N, C, H, W]): input tensor
         boxes (Tensor[K, 5] or List[Tensor[L, 4]]): the box coordinates in (x1, y1, x2, y2)
-            format where the regions will be taken from. If a single Tensor is passed,
+            format where the regions will be taken from.
+            The coordinates must satisfy ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
+            If a single Tensor is passed,
             then the first column should contain the batch index. If a list of Tensors
             is passed, then each Tensor will correspond to the boxes for an element i
             in a batch
diff --git a/torchvision/ops/roi_align.py b/torchvision/ops/roi_align.py
index 81453ff921a..0f6c0be1729 100644
--- a/torchvision/ops/roi_align.py
+++ b/torchvision/ops/roi_align.py
@@ -22,7 +22,9 @@ def roi_align(
     Args:
         input (Tensor[N, C, H, W]): input tensor
         boxes (Tensor[K, 5] or List[Tensor[L, 4]]): the box coordinates in (x1, y1, x2, y2)
-            format where the regions will be taken from. If a single Tensor is passed,
+            format where the regions will be taken from.
+            The coordinates must satisfy ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
+            If a single Tensor is passed,
             then the first column should contain the batch index. If a list of Tensors
             is passed, then each Tensor will correspond to the boxes for an element i
             in a batch
diff --git a/torchvision/ops/roi_pool.py b/torchvision/ops/roi_pool.py
index 9c150099455..fce6392fbfd 100644
--- a/torchvision/ops/roi_pool.py
+++ b/torchvision/ops/roi_pool.py
@@ -20,7 +20,9 @@ def roi_pool(
     Args:
         input (Tensor[N, C, H, W]): input tensor
        boxes (Tensor[K, 5] or List[Tensor[L, 4]]): the box coordinates in (x1, y1, x2, y2)
-            format where the regions will be taken from. If a single Tensor is passed,
+            format where the regions will be taken from.
+            The coordinates must satisfy ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
+            If a single Tensor is passed,
             then the first column should contain the batch index. If a list of Tensors
             is passed, then each Tensor will correspond to the boxes for an element i
             in a batch

From a51aca37a32a5771619392a8040b06f5d17be0ea Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Mon, 22 Feb 2021 14:15:51 +0000
Subject: [PATCH 3/3] flake8

---
 torchvision/ops/boxes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchvision/ops/boxes.py b/torchvision/ops/boxes.py
index c0059062aaa..cfce618845a 100644
--- a/torchvision/ops/boxes.py
+++ b/torchvision/ops/boxes.py
@@ -189,7 +189,7 @@ def box_area(boxes: Tensor) -> Tensor:

     Args:
         boxes (Tensor[N, 4]): boxes for which the area will be computed. They
-            are expected to be in (x1, y1, x2, y2) format with 
+            are expected to be in (x1, y1, x2, y2) format with
             ``0 <= x1 < x2`` and ``0 <= y1 < y2``.

     Returns:
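Taken together, the three patches pin down a single convention for the box utilities in ``torchvision.ops``. A quick sanity-check sketch with hand-picked values (the expected outputs in the comments follow from the geometry, not from the patch itself)::

    import torch
    from torchvision.ops import box_area, box_iou, nms

    # (x1, y1, x2, y2) boxes obeying 0 <= x1 < x2 and 0 <= y1 < y2
    boxes = torch.tensor([
        [0.0, 0.0, 10.0, 10.0],
        [1.0, 1.0, 11.0, 11.0],    # overlaps box 0 heavily (IoU = 81/119, ~0.68)
        [20.0, 20.0, 30.0, 30.0],  # disjoint from both
    ])
    scores = torch.tensor([0.9, 0.8, 0.7])

    print(box_area(boxes))        # tensor([100., 100., 100.])
    print(box_iou(boxes, boxes))  # 3x3 pairwise IoU matrix, ones on the diagonal
    keep = nms(boxes, scores, iou_threshold=0.5)
    print(keep)                   # tensor([0, 2]): box 1 is suppressed by box 0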