Improvements to models docs #3853

Merged: 3 commits, May 18, 2021
6 changes: 5 additions & 1 deletion docs/source/models.rst
@@ -329,6 +329,8 @@ The images have to be loaded in to a range of ``[0, 1]`` and then normalized usi
``mean = [0.485, 0.456, 0.406]`` and ``std = [0.229, 0.224, 0.225]``.
They have been trained on images resized such that their minimum size is 520.

For details on how to plot the masks of such models, you may refer to :ref:`semantic_seg_output`.

The pre-trained models have been trained on a subset of COCO train2017, on the 20 categories that are
present in the Pascal VOC dataset. You can see more information on how the subset has been selected in
``references/segmentation/coco_utils.py``. The classes that the pre-trained model outputs are the following,
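
For illustration, a minimal sketch (not part of the diff) of the input convention this hunk describes; ``fcn_resnet50`` stands in for any of the segmentation models, and a random tensor stands in for an image already loaded into the ``[0, 1]`` range:

import torch
from torchvision.models.segmentation import fcn_resnet50
from torchvision.transforms.functional import normalize

model = fcn_resnet50(pretrained=True).eval()
img = torch.rand(3, 520, 780)  # stand-in for a real image in the [0, 1] range
batch = normalize(img, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]).unsqueeze(0)
with torch.no_grad():
    out = model(batch)['out']  # [1, 21, H, W] class logits
classes = out.argmax(dim=1)    # per-pixel predicted class indices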
@@ -374,6 +376,7 @@ LR-ASPP

.. autofunction:: torchvision.models.segmentation.lraspp_mobilenet_v3_large

.. _object_det_inst_seg_pers_keypoint_det:

Object Detection, Instance Segmentation and Person Keypoint Detection
=====================================================================
@@ -392,7 +395,8 @@ in torchvision.

The models expect a list of ``Tensor[C, H, W]``, in the range ``0-1``.
The models internally resize the images but the behaviour varies depending
on the model. Check the constructor of the models for more information.
on the model. Check the constructor of the models for more information. The
output format of such models is illustrated in :ref:`instance_seg_output`.


For object detection and instance segmentation, the pre-trained
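Below is a hedged sketch, not part of the diff, of the inference call these paragraphs describe: a list of ``Tensor[C, H, W]`` in the ``0-1`` range goes in, and one dict per image comes out. The model choice and image sizes are arbitrary.

import torch
from torchvision.models.detection import fasterrcnn_resnet50_fpn

model = fasterrcnn_resnet50_fpn(pretrained=True).eval()
imgs = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]  # 0-1 range, sizes may differ
with torch.no_grad():
    outputs = model(imgs)  # list of dicts, one per input image
print(outputs[0].keys())   # boxes, labels, scores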
22 changes: 18 additions & 4 deletions gallery/plot_visualization_utils.py
@@ -68,7 +68,8 @@ def show(imgs):
# models. Here is a demo with a Faster R-CNN model loaded from
# :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn`.
# You can also try using a RetinaNet with
# :func:`~torchvision.models.detection.retinanet_resnet50_fpn`.
# :func:`~torchvision.models.detection.retinanet_resnet50_fpn`. For more details
# on the output of such models, you may refer to :ref:`instance_seg_output`.

from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.transforms.functional import convert_image_dtype
@@ -87,9 +88,9 @@ def show(imgs):
# Let's plot the boxes detected by our model. We will only plot the boxes with a
# score greater than a given threshold.

threshold = .8
score_threshold = .8
dogs_with_boxes = [
draw_bounding_boxes(dog_int, boxes=output['boxes'][output['scores'] > threshold], width=4)
draw_bounding_boxes(dog_int, boxes=output['boxes'][output['scores'] > score_threshold], width=4)
for dog_int, output in zip(batch_int, outputs)
]
show(dogs_with_boxes)
@@ -102,6 +103,8 @@ def show(imgs):
# segmentation models have different outputs, so we will treat each
# independently.
#
# .. _semantic_seg_output:
#
# Semantic segmentation models
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
@@ -237,6 +240,8 @@ def show(imgs):


#####################################
# .. _instance_seg_output:
#
# Instance segmentation models
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
@@ -245,6 +250,15 @@ def show(imgs):
# models. Let's start by analyzing the output of a Mask-RCNN model. Note that
# these models don't require the images to be normalized, so we don't need to
# use the normalized batch.
#
# .. note::
#
# We will here describe the output of a Mask-RCNN model. The models in
# :ref:`object_det_inst_seg_pers_keypoint_det` all have a similar output
# format, but some of them may have extra info like keypoints for
# :func:`~torchvision.models.detection.keypointrcnn_resnet50_fpn`, and some
# of them may not have masks, like
# :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn`.

from torchvision.models.detection import maskrcnn_resnet50_fpn
model = maskrcnn_resnet50_fpn(pretrained=True, progress=False)
@@ -255,7 +269,7 @@ def show(imgs):

#####################################
# Let's break this down. For each image in the batch, the model outputs some
# detections (or instances). The number of detection varies for each input
# detections (or instances). The number of detections varies for each input
# image. Each instance is described by its bounding box, its label, its score
# and its mask.
#
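A possible next step for this gallery section, sketched here for illustration only (it is not part of the diff, and the name ``output`` holding the Mask R-CNN predictions is assumed from the surrounding, collapsed code): keep only the confident instances and binarize their soft masks, which is the boolean form that :func:`~torchvision.utils.draw_segmentation_masks` works with.

score_threshold = .75
proba_threshold = .5
pred = output[0]                                                  # dict for the first image
keep = pred['scores'] > score_threshold
bool_masks = (pred['masks'][keep] > proba_threshold).squeeze(1)   # [K, H, W] boolean masks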
16 changes: 11 additions & 5 deletions torchvision/models/detection/faster_rcnn.py
@@ -317,12 +317,14 @@ def fasterrcnn_resnet50_fpn(pretrained=False, progress=True,

During inference, the model requires only the input tensors, and returns the post-processed
predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
follows:
follows, where ``N`` is the number of detections:

- boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
- labels (``Int64Tensor[N]``): the predicted labels for each image
- scores (``Tensor[N]``): the scores or each prediction
- labels (``Int64Tensor[N]``): the predicted labels for each detection
- scores (``Tensor[N]``): the scores of each detection

For more details on the output, you may refer to :ref:`instance_seg_output`.

Faster R-CNN is exportable to ONNX for a fixed batch size with input images of fixed size.
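
The export mentioned above can be sketched as follows (illustrative only, not part of the diff; the file name and opset version are arbitrary, and the input is a fixed-size random image):

import torch
from torchvision.models.detection import fasterrcnn_resnet50_fpn

model = fasterrcnn_resnet50_fpn(pretrained=True).eval()
x = [torch.rand(3, 800, 800)]  # fixed batch size, fixed image size
torch.onnx.export(model, x, "faster_rcnn.onnx", opset_version=11)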

@@ -399,7 +401,9 @@ def fasterrcnn_mobilenet_v3_large_320_fpn(pretrained=False, progress=True, num_c
trainable_backbone_layers=None, **kwargs):
"""
Constructs a low resolution Faster R-CNN model with a MobileNetV3-Large FPN backbone tuned for mobile use-cases.
It works similarly to Faster R-CNN with ResNet-50 FPN backbone. See `fasterrcnn_resnet50_fpn` for more details.
It works similarly to Faster R-CNN with ResNet-50 FPN backbone. See
:func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn` for more
details.

Example::

@@ -435,7 +439,9 @@ def fasterrcnn_mobilenet_v3_large_fpn(pretrained=False, progress=True, num_class
trainable_backbone_layers=None, **kwargs):
"""
Constructs a high resolution Faster R-CNN model with a MobileNetV3-Large FPN backbone.
It works similarly to Faster R-CNN with ResNet-50 FPN backbone. See `fasterrcnn_resnet50_fpn` for more details.
It works similarly to Faster R-CNN with ResNet-50 FPN backbone. See
:func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn` for more
details.

Example::

8 changes: 5 additions & 3 deletions torchvision/models/detection/keypoint_rcnn.py
@@ -297,14 +297,16 @@ def keypointrcnn_resnet50_fpn(pretrained=False, progress=True,

During inference, the model requires only the input tensors, and returns the post-processed
predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
follows:
follows, where ``N`` is the number of detected instances:

- boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
- labels (``Int64Tensor[N]``): the predicted labels for each image
- scores (``Tensor[N]``): the scores or each prediction
- labels (``Int64Tensor[N]``): the predicted labels for each instance
- scores (``Tensor[N]``): the scores of each instance
- keypoints (``FloatTensor[N, K, 3]``): the locations of the predicted keypoints, in ``[x, y, v]`` format.

For more details on the output, you may refer to :ref:`instance_seg_output`.

Keypoint R-CNN is exportable to ONNX for a fixed batch size with input images of fixed size.

Example::
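As an illustrative sketch (not part of the diff), reading the extra ``keypoints`` field out of a prediction on a random input; the 17 keypoints are the COCO person keypoints:

import torch
from torchvision.models.detection import keypointrcnn_resnet50_fpn

model = keypointrcnn_resnet50_fpn(pretrained=True).eval()
pred = model([torch.rand(3, 400, 400)])[0]
kpts = pred['keypoints']    # FloatTensor[N, 17, 3] in [x, y, v] format
visible = kpts[..., 2] > 0  # v is a visibility flag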
8 changes: 5 additions & 3 deletions torchvision/models/detection/mask_rcnn.py
@@ -289,16 +289,18 @@ def maskrcnn_resnet50_fpn(pretrained=False, progress=True,

During inference, the model requires only the input tensors, and returns the post-processed
predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
follows:
follows, where ``N`` is the number of detected instances:

- boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
- labels (``Int64Tensor[N]``): the predicted labels for each image
- scores (``Tensor[N]``): the scores or each prediction
- labels (``Int64Tensor[N]``): the predicted labels for each instance
- scores (``Tensor[N]``): the scores of each instance
- masks (``UInt8Tensor[N, 1, H, W]``): the predicted masks for each instance, in ``0-1`` range. In order to
obtain the final segmentation masks, the soft masks can be thresholded, generally
with a value of 0.5 (``mask >= 0.5``)

For more details on the output and on how to plot the masks, you may refer to :ref:`instance_seg_output`.

Mask R-CNN is exportable to ONNX for a fixed batch size with input images of fixed size.

Example::
8 changes: 5 additions & 3 deletions torchvision/models/detection/retinanet.py
@@ -586,12 +586,14 @@ def retinanet_resnet50_fpn(pretrained=False, progress=True,

During inference, the model requires only the input tensors, and returns the post-processed
predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
follows:
follows, where ``N`` is the number of detections:

- boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
- labels (``Int64Tensor[N]``): the predicted labels for each image
- scores (``Tensor[N]``): the scores or each prediction
- labels (``Int64Tensor[N]``): the predicted labels for each detection
- scores (``Tensor[N]``): the scores of each detection

For more details on the output, you may refer to :ref:`instance_seg_output`.

Example::

37 changes: 32 additions & 5 deletions torchvision/models/detection/ssd.py
@@ -126,11 +126,12 @@ class SSD(nn.Module):

During inference, the model requires only the input tensors, and returns the post-processed
predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
follows:
follows, where ``N`` is the number of detections:

- boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
- labels (Int64Tensor[N]): the predicted labels for each image
- scores (Tensor[N]): the scores for each prediction
- labels (Int64Tensor[N]): the predicted labels for each detection
- scores (Tensor[N]): the scores for each detection

Args:
backbone (nn.Module): the network used to compute the features for the model.
@@ -520,8 +521,34 @@ def _vgg_extractor(backbone_name: str, highres: bool, progress: bool, pretrained

def ssd300_vgg16(pretrained: bool = False, progress: bool = True, num_classes: int = 91,
pretrained_backbone: bool = True, trainable_backbone_layers: Optional[int] = None, **kwargs: Any):
"""
Constructs an SSD model with input size 300x300 and a VGG16 backbone. See `SSD` for more details.
"""Constructs an SSD model with input size 300x300 and a VGG16 backbone.

Reference: `"SSD: Single Shot MultiBox Detector" <https://arxiv.org/abs/1512.02325>`_.

The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
image, and should be in the 0-1 range. Different images can have different sizes, but they will be resized
to a fixed size before being passed to the backbone.

The behavior of the model changes depending on whether it is in training or evaluation mode.

During training, the model expects both the input tensors and a list of target dictionaries,
containing:

- boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
- labels (Int64Tensor[N]): the class label for each ground-truth box

The model returns a Dict[Tensor] during training, containing the classification and regression
losses.

During inference, the model requires only the input tensors, and returns the post-processed
predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
follows, where ``N`` is the number of detections:

- boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
- labels (Int64Tensor[N]): the predicted labels for each detection
- scores (Tensor[N]): the scores for each detection

Example:

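An illustrative sketch (not part of the diff) of the training-mode call this new docstring describes; the box, label, and loss keys are assumptions made for the example:

import torch
from torchvision.models.detection import ssd300_vgg16

model = ssd300_vgg16(pretrained=False, num_classes=91)
model.train()
images = [torch.rand(3, 300, 300)]
targets = [{'boxes': torch.tensor([[10., 20., 200., 250.]]),
            'labels': torch.tensor([17])}]
loss_dict = model(images, targets)  # classification and box regression losses (exact keys may differ)
loss = sum(loss_dict.values())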
5 changes: 3 additions & 2 deletions torchvision/models/detection/ssdlite.py
@@ -158,8 +158,9 @@ def ssdlite320_mobilenet_v3_large(pretrained: bool = False, progress: bool = Tru
pretrained_backbone: bool = False, trainable_backbone_layers: Optional[int] = None,
norm_layer: Optional[Callable[..., nn.Module]] = None,
**kwargs: Any):
"""
Constructs an SSDlite model with input size 320x320 and a MobileNetV3 Large backbone. See `SSD` for more details.
"""Constructs an SSDlite model with input size 320x320 and a MobileNetV3 Large backbone.

See :func:`~torchvision.models.detection.ssd300_vgg16` for more details.

Example:
