diff --git a/torchvision/models/detection/mask_rcnn.py b/torchvision/models/detection/mask_rcnn.py
index 4f9f95aa868..f992b49fdc4 100644
--- a/torchvision/models/detection/mask_rcnn.py
+++ b/torchvision/models/detection/mask_rcnn.py
@@ -31,7 +31,7 @@ class MaskRCNN(FasterRCNN):
         - boxes (Tensor[N, 4]): the ground-truth boxes in [x0, y0, x1, y1] format, with values
           between 0 and H and 0 and W
         - labels (Tensor[N]): the class label for each ground-truth box
-        - masks (Tensor[N, H, W]): the segmentation binary masks for each instance
+        - masks (Tensor[N, 1, H, W]): the segmentation binary masks for each instance
 
     The model returns a Dict[Tensor] during training, containing the classification and regression
     losses for both the RPN and the R-CNN, and the mask loss.
@@ -43,7 +43,7 @@ class MaskRCNN(FasterRCNN):
           0 and H and 0 and W
         - labels (Tensor[N]): the predicted labels for each image
         - scores (Tensor[N]): the scores or each prediction
-        - masks (Tensor[N, H, W]): the predicted masks for each instance, in 0-1 range. In order to
+        - masks (Tensor[N, 1, H, W]): the predicted masks for each instance, in 0-1 range. In order to
           obtain the final segmentation masks, the soft masks can be thresholded, generally
           with a value of 0.5 (mask >= 0.5)