[BC Breaking] Make sure the pipeline input shape is in WH order #9324

Merged
merged 12 commits on Dec 20, 2022
3 changes: 2 additions & 1 deletion configs/yolox/yolox_s_8xb8-300e_coco.py
@@ -1,6 +1,6 @@
_base_ = ['../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py']

img_scale = (640, 640) # height, width
img_scale = (640, 640) # width, height

# model settings
model = dict(
@@ -83,6 +83,7 @@
dict(
type='RandomAffine',
scaling_ratio_range=(0.1, 2),
# img_scale is (width, height)
border=(-img_scale[0] // 2, -img_scale[1] // 2)),
dict(
type='MixUp',
3 changes: 2 additions & 1 deletion configs/yolox/yolox_tiny_8xb8-300e_coco.py
@@ -13,7 +13,7 @@
neck=dict(in_channels=[96, 192, 384], out_channels=96),
bbox_head=dict(in_channels=96, feat_channels=96))

img_scale = (640, 640) # height, width
img_scale = (640, 640) # width, height

# file_client_args = dict(
# backend='petrel',
@@ -28,6 +28,7 @@
dict(
type='RandomAffine',
scaling_ratio_range=(0.5, 1.5),
# img_scale is (width, height)
border=(-img_scale[0] // 2, -img_scale[1] // 2)),
dict(type='YOLOXHSVRandomAug'),
dict(type='RandomFlip', prob=0.5),
32 changes: 32 additions & 0 deletions docs/en/advanced_guides/conventions.md
@@ -2,6 +2,38 @@

Please check the following conventions if you would like to modify MMDetection for your own project.

## Image pipeline input shape

In OpenMMLab 2.0, shape-related arguments passed as input to the image pipeline are in `(width, height)` order, while the shapes recorded in the pipeline outputs, and those used inside the model, are always `(height, width)`. The common fields are as follows:

- img_shape: (height, width)
- ori_shape: (height, width)
- pad_shape: (height, width)
- batch_input_shape: (height, width)

As an example, the initialization parameters of `Mosaic` are as follows:

```python
@TRANSFORMS.register_module()
class Mosaic(BaseTransform):

    def __init__(self,
                 img_scale: Tuple[int, int] = (640, 640),
                 center_ratio_range: Tuple[float, float] = (0.5, 1.5),
                 bbox_clip_border: bool = True,
                 pad_val: float = 114.0,
                 prob: float = 1.0) -> None:
        ...

        # img_scale order should be (width, height)
        self.img_scale = img_scale

    def transform(self, results: dict) -> dict:
        ...

        results['img'] = mosaic_img
        results['img_shape'] = mosaic_img.shape[:2]
```
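
To see the two orders side by side, here is a minimal self-contained sketch (the `toy_resize` helper is hypothetical and only illustrates the convention; it is not part of MMDetection): the scale argument is given as `(width, height)`, while the `img_shape` written back into the results dict comes from `ndarray.shape` and is therefore `(height, width)`.

```python
import numpy as np


def toy_resize(results: dict, scale: tuple) -> dict:
    """Hypothetical transform: ``scale`` is (width, height), like ``img_scale``."""
    width, height = scale
    img = results['img']
    # A real transform would interpolate; here we only allocate the target size.
    resized = np.zeros((height, width, img.shape[2]), dtype=img.dtype)
    results['img'] = resized
    # ndarray.shape is (height, width, channels), so the recorded shape is (h, w).
    results['img_shape'] = resized.shape[:2]
    results['scale_factor'] = (width / img.shape[1], height / img.shape[0])
    return results


results = {'img': np.zeros((480, 640, 3), dtype=np.uint8),  # 640 wide, 480 high
           'ori_shape': (480, 640)}                          # (height, width)
results = toy_resize(results, scale=(1280, 960))             # (width, height)
assert results['img_shape'] == (960, 1280)                   # (height, width)
```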

## Loss

In MMDetection, a `dict` containing losses and metrics will be returned by `model(**data)`.
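
As a rough sketch of how such a dict is typically consumed (the keys below are made up for illustration), entries whose name contains `loss` are summed to form the value that is backpropagated, while the remaining entries are only logged as metrics:

```python
import torch

# Hypothetical output of ``model(**data)``; the real keys depend on the detector.
losses = {
    'loss_cls': torch.tensor(0.35),
    'loss_bbox': torch.tensor(0.12),
    'acc': torch.tensor(87.5),  # accuracy is a metric, not a loss
}

# Sum every entry whose key contains 'loss' to obtain the training loss.
total_loss = sum(value for key, value in losses.items() if 'loss' in key)
# Everything (losses and metrics) is still reported in the training logs.
log_vars = {key: value.item() for key, value in losses.items()}
print(total_loss, log_vars)
```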
32 changes: 32 additions & 0 deletions docs/zh_cn/advanced_guides/conventions.md
@@ -2,6 +2,38 @@

Please follow the conventions below if you would like to modify MMDetection for your own project.

## Image pipeline input shape

In OpenMMLab 2.0, shape-related arguments passed as input to the image pipeline are in `(width, height)` order, while the shapes recorded in the pipeline outputs, and those used inside the model, are always `(height, width)`. The common fields are as follows:

- img_shape: (height, width)
- ori_shape: (height, width)
- pad_shape: (height, width)
- batch_input_shape: (height, width)

Taking `Mosaic` as an example, its initialization parameters are as follows:

```python
@TRANSFORMS.register_module()
class Mosaic(BaseTransform):

    def __init__(self,
                 img_scale: Tuple[int, int] = (640, 640),
                 center_ratio_range: Tuple[float, float] = (0.5, 1.5),
                 bbox_clip_border: bool = True,
                 pad_val: float = 114.0,
                 prob: float = 1.0) -> None:
        ...

        # img_scale order should be (width, height)
        self.img_scale = img_scale

    def transform(self, results: dict) -> dict:
        ...

        results['img'] = mosaic_img
        results['img_shape'] = mosaic_img.shape[:2]
```

## Loss

In MMDetection, `model(**data)` returns a `dict` containing all the losses and metrics.
4 changes: 2 additions & 2 deletions mmdet/datasets/transforms/formatting.py
@@ -21,10 +21,10 @@ class PackDetInputs(BaseTransform):

- ``img_path``: path to the image file

- ``ori_shape``: original shape of the image as a tuple (h, w, c)
- ``ori_shape``: original shape of the image as a tuple (h, w)

- ``img_shape``: shape of the image input to the network as a tuple \
(h, w, c). Note that images may be zero padded on the \
(h, w). Note that images may be zero padded on the \
bottom/right if the batch tensor is larger than this shape.

- ``scale_factor``: a float indicating the preprocessing scale