Commit

Merge branch 'dev' into dev
JiayuXu0 authored Aug 4, 2022
2 parents d58b4f8 + b5bb86a commit 31dbe12
Showing 65 changed files with 3,086 additions and 166 deletions.
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -1,3 +1,4 @@
include requirements/*.txt
include mmcls/.mim/model-index.yml
recursive-include mmcls/.mim/configs *.py *.yml
recursive-include mmcls/.mim/tools *.py *.sh
1 change: 1 addition & 0 deletions README.md
@@ -142,6 +142,7 @@ Results and models are available in the [model zoo](https://mmclassification.rea
- [x] [ConvMixer](https://github.com/open-mmlab/mmclassification/tree/master/configs/convmixer)
- [x] [CSPNet](https://github.com/open-mmlab/mmclassification/tree/master/configs/cspnet)
- [x] [PoolFormer](https://github.com/open-mmlab/mmclassification/tree/master/configs/poolformer)
- [x] [MViT](https://github.com/open-mmlab/mmclassification/tree/master/configs/mvit)

</details>

1 change: 1 addition & 0 deletions README_zh-CN.md
@@ -142,6 +142,7 @@ pip3 install -e .
- [x] [ConvMixer](https://github.com/open-mmlab/mmclassification/tree/master/configs/convmixer)
- [x] [CSPNet](https://github.com/open-mmlab/mmclassification/tree/master/configs/cspnet)
- [x] [PoolFormer](https://github.com/open-mmlab/mmclassification/tree/master/configs/poolformer)
- [x] [MViT](https://github.com/open-mmlab/mmclassification/tree/master/configs/mvit)

</details>

71 changes: 71 additions & 0 deletions configs/_base_/datasets/imagenet_bs64_swin_256.py
@@ -0,0 +1,71 @@
_base_ = ['./pipelines/rand_aug.py']

# dataset settings
dataset_type = 'ImageNet'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='RandomResizedCrop',
        size=256,
        backend='pillow',
        interpolation='bicubic'),
    dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'),
    dict(
        type='RandAugment',
        policies={{_base_.rand_increasing_policies}},
        num_policies=2,
        total_level=10,
        magnitude_level=9,
        magnitude_std=0.5,
        hparams=dict(
            pad_val=[round(x) for x in img_norm_cfg['mean'][::-1]],
            interpolation='bicubic')),
    dict(
        type='RandomErasing',
        erase_prob=0.25,
        mode='rand',
        min_area_ratio=0.02,
        max_area_ratio=1 / 3,
        fill_color=img_norm_cfg['mean'][::-1],
        fill_std=img_norm_cfg['std'][::-1]),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='ImageToTensor', keys=['img']),
    dict(type='ToTensor', keys=['gt_label']),
    dict(type='Collect', keys=['img', 'gt_label'])
]

test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='Resize',
        size=(292, -1),  # ( 256 / 224 * 256 )
        backend='pillow',
        interpolation='bicubic'),
    dict(type='CenterCrop', crop_size=256),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='ImageToTensor', keys=['img']),
    dict(type='Collect', keys=['img'])
]
data = dict(
    samples_per_gpu=64,
    workers_per_gpu=8,
    train=dict(
        type=dataset_type,
        data_prefix='data/imagenet/train',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        data_prefix='data/imagenet/val',
        ann_file='data/imagenet/meta/val.txt',
        pipeline=test_pipeline),
    test=dict(
        # replace `data/val` with `data/test` for standard test
        type=dataset_type,
        data_prefix='data/imagenet/val',
        ann_file='data/imagenet/meta/val.txt',
        pipeline=test_pipeline))

evaluation = dict(interval=10, metric='accuracy')
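This base file is meant to be pulled in through MMCV's `_base_` config inheritance, with child configs overriding only the keys that differ. A minimal sketch of such a child config (the file name and overridden values below are hypothetical):

```python
# hypothetical child config, e.g. configs/mvit/my_experiment.py
_base_ = ['../_base_/datasets/imagenet_bs64_swin_256.py']

# MMCV merges nested dicts, so only the overridden keys change;
# the train/test pipelines defined in the base file are inherited untouched.
data = dict(samples_per_gpu=32)  # smaller per-GPU batch
evaluation = dict(interval=1, metric='accuracy')  # evaluate every epoch
```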
19 changes: 19 additions & 0 deletions configs/_base_/models/mvit/mvitv2-base.py
@@ -0,0 +1,19 @@
model = dict(
    type='ImageClassifier',
    backbone=dict(type='MViT', arch='base', drop_path_rate=0.3),
    neck=dict(type='GlobalAveragePooling'),
    head=dict(
        type='LinearClsHead',
        in_channels=768,
        num_classes=1000,
        loss=dict(
            type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
    ),
    init_cfg=[
        dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
        dict(type='Constant', layer='LayerNorm', val=1., bias=0.)
    ],
    train_cfg=dict(augments=[
        dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5),
        dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5)
    ]))
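The README below references a full config named `mvitv2-base_8xb256_in1k.py`, which would compose this model base with dataset, schedule, and runtime bases. A sketch of that composition, assuming the usual mmcls layout (only the model path is taken from this commit; the other three base paths are assumptions):

```python
# configs/mvit/mvitv2-base_8xb256_in1k.py -- sketch, not the committed file
_base_ = [
    '../_base_/models/mvit/mvitv2-base.py',          # added in this commit
    '../_base_/datasets/imagenet_bs64_swin_256.py',  # assumed dataset base
    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',  # assumed schedule
    '../_base_/default_runtime.py',                  # assumed runtime base
]
```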
23 changes: 23 additions & 0 deletions configs/_base_/models/mvit/mvitv2-large.py
@@ -0,0 +1,23 @@
model = dict(
    type='ImageClassifier',
    backbone=dict(
        type='MViT',
        arch='large',
        drop_path_rate=0.5,
        dim_mul_in_attention=False),
    neck=dict(type='GlobalAveragePooling'),
    head=dict(
        type='LinearClsHead',
        in_channels=1152,
        num_classes=1000,
        loss=dict(
            type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
    ),
    init_cfg=[
        dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
        dict(type='Constant', layer='LayerNorm', val=1., bias=0.)
    ],
    train_cfg=dict(augments=[
        dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5),
        dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5)
    ]))
19 changes: 19 additions & 0 deletions configs/_base_/models/mvit/mvitv2-small.py
@@ -0,0 +1,19 @@
model = dict(
    type='ImageClassifier',
    backbone=dict(type='MViT', arch='small', drop_path_rate=0.1),
    neck=dict(type='GlobalAveragePooling'),
    head=dict(
        type='LinearClsHead',
        in_channels=768,
        num_classes=1000,
        loss=dict(
            type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
    ),
    init_cfg=[
        dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
        dict(type='Constant', layer='LayerNorm', val=1., bias=0.)
    ],
    train_cfg=dict(augments=[
        dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5),
        dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5)
    ]))
19 changes: 19 additions & 0 deletions configs/_base_/models/mvit/mvitv2-tiny.py
@@ -0,0 +1,19 @@
model = dict(
    type='ImageClassifier',
    backbone=dict(type='MViT', arch='tiny', drop_path_rate=0.1),
    neck=dict(type='GlobalAveragePooling'),
    head=dict(
        type='LinearClsHead',
        in_channels=768,
        num_classes=1000,
        loss=dict(
            type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
    ),
    init_cfg=[
        dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
        dict(type='Constant', layer='LayerNorm', val=1., bias=0.)
    ],
    train_cfg=dict(augments=[
        dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5),
        dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5)
    ]))
25 changes: 25 additions & 0 deletions configs/_base_/models/swin_transformer_v2/base_256.py
@@ -0,0 +1,25 @@
# model settings
model = dict(
    type='ImageClassifier',
    backbone=dict(
        type='SwinTransformerV2',
        arch='base',
        img_size=256,
        drop_path_rate=0.5),
    neck=dict(type='GlobalAveragePooling'),
    head=dict(
        type='LinearClsHead',
        num_classes=1000,
        in_channels=1024,
        init_cfg=None,  # suppress the default init_cfg of LinearClsHead.
        loss=dict(
            type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
        cal_acc=False),
    init_cfg=[
        dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
        dict(type='Constant', layer='LayerNorm', val=1., bias=0.)
    ],
    train_cfg=dict(augments=[
        dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5),
        dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5)
    ]))
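As with the dataset base, a child config can override single fields of this model definition without restating the rest. For instance, a hypothetical variant with a milder stochastic-depth rate:

```python
# hypothetical child config building on base_256.py
_base_ = ['../_base_/models/swin_transformer_v2/base_256.py']

# Only drop_path_rate changes; arch, img_size, head, etc. are inherited.
model = dict(backbone=dict(drop_path_rate=0.3))
```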
17 changes: 17 additions & 0 deletions configs/_base_/models/swin_transformer_v2/base_384.py
@@ -0,0 +1,17 @@
# model settings
model = dict(
    type='ImageClassifier',
    backbone=dict(
        type='SwinTransformerV2',
        arch='base',
        img_size=384,
        drop_path_rate=0.2),
    neck=dict(type='GlobalAveragePooling'),
    head=dict(
        type='LinearClsHead',
        num_classes=1000,
        in_channels=1024,
        init_cfg=None,  # suppress the default init_cfg of LinearClsHead.
        loss=dict(
            type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
        cal_acc=False))
16 changes: 16 additions & 0 deletions configs/_base_/models/swin_transformer_v2/large_256.py
@@ -0,0 +1,16 @@
# model settings
# Only for evaluation
model = dict(
    type='ImageClassifier',
    backbone=dict(
        type='SwinTransformerV2',
        arch='large',
        img_size=256,
        drop_path_rate=0.2),
    neck=dict(type='GlobalAveragePooling'),
    head=dict(
        type='LinearClsHead',
        num_classes=1000,
        in_channels=1536,
        loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
        topk=(1, 5)))
16 changes: 16 additions & 0 deletions configs/_base_/models/swin_transformer_v2/large_384.py
@@ -0,0 +1,16 @@
# model settings
# Only for evaluation
model = dict(
    type='ImageClassifier',
    backbone=dict(
        type='SwinTransformerV2',
        arch='large',
        img_size=384,
        drop_path_rate=0.2),
    neck=dict(type='GlobalAveragePooling'),
    head=dict(
        type='LinearClsHead',
        num_classes=1000,
        in_channels=1536,
        loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
        topk=(1, 5)))
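Both `large_*` configs carry a plain cross-entropy head and no `train_cfg`, so, as their comments note, they only describe the model for checkpoint evaluation. A minimal sketch of instantiating such a model with the mmcls 0.x API (the checkpoint path is a placeholder):

```python
from mmcv import Config
from mmcv.runner import load_checkpoint
from mmcls.models import build_classifier

cfg = Config.fromfile(
    'configs/_base_/models/swin_transformer_v2/large_384.py')
model = build_classifier(cfg.model)
# Placeholder path: point this at a converted SwinV2-L checkpoint.
load_checkpoint(model, 'swinv2-large_384.pth', map_location='cpu')
model.eval()
```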
25 changes: 25 additions & 0 deletions configs/_base_/models/swin_transformer_v2/small_256.py
@@ -0,0 +1,25 @@
# model settings
model = dict(
    type='ImageClassifier',
    backbone=dict(
        type='SwinTransformerV2',
        arch='small',
        img_size=256,
        drop_path_rate=0.3),
    neck=dict(type='GlobalAveragePooling'),
    head=dict(
        type='LinearClsHead',
        num_classes=1000,
        in_channels=768,
        init_cfg=None,  # suppress the default init_cfg of LinearClsHead.
        loss=dict(
            type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
        cal_acc=False),
    init_cfg=[
        dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
        dict(type='Constant', layer='LayerNorm', val=1., bias=0.)
    ],
    train_cfg=dict(augments=[
        dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5),
        dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5)
    ]))
25 changes: 25 additions & 0 deletions configs/_base_/models/swin_transformer_v2/tiny_256.py
@@ -0,0 +1,25 @@
# model settings
model = dict(
    type='ImageClassifier',
    backbone=dict(
        type='SwinTransformerV2',
        arch='tiny',
        img_size=256,
        drop_path_rate=0.2),
    neck=dict(type='GlobalAveragePooling'),
    head=dict(
        type='LinearClsHead',
        num_classes=1000,
        in_channels=768,
        init_cfg=None,  # suppress the default init_cfg of LinearClsHead.
        loss=dict(
            type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
        cal_acc=False),
    init_cfg=[
        dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
        dict(type='Constant', layer='LayerNorm', val=1., bias=0.)
    ],
    train_cfg=dict(augments=[
        dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5),
        dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5)
    ]))
44 changes: 44 additions & 0 deletions configs/mvit/README.md
@@ -0,0 +1,44 @@
# MViT V2

> [MViTv2: Improved Multiscale Vision Transformers for Classification and Detection](http://openaccess.thecvf.com//content/CVPR2022/papers/Li_MViTv2_Improved_Multiscale_Vision_Transformers_for_Classification_and_Detection_CVPR_2022_paper.pdf)

<!-- [ALGORITHM] -->

## Abstract

In this paper, we study Multiscale Vision Transformers (MViTv2) as a unified architecture for image and video
classification, as well as object detection. We present an improved version of MViT that incorporates
decomposed relative positional embeddings and residual pooling connections. We instantiate this architecture
in five sizes and evaluate it for ImageNet classification, COCO detection and Kinetics video recognition where
it outperforms prior work. We further compare MViTv2s' pooling attention to window attention mechanisms where
it outperforms the latter in accuracy/compute. Without bells-and-whistles, MViTv2 has state-of-the-art
performance in 3 domains: 88.8% accuracy on ImageNet classification, 58.7 boxAP on COCO object detection as
well as 86.1% on Kinetics-400 video classification.

<div align=center>
<img src="https://user-images.githubusercontent.com/26739999/180376227-755243fa-158e-4068-940a-416036519665.png" width="50%"/>
</div>

## Results and models

### ImageNet-1k

| Model | Pretrain | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
| :------------: | :----------: | :-------: | :------: | :-------: | :-------: | :------------------------------------------------------------------: | :---------------------------------------------------------------------: |
| MViTv2-tiny\* | From scratch | 24.17 | 4.70 | 82.33 | 96.15 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/mvit/mvitv2-tiny_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-tiny_3rdparty_in1k_20220722-db7beeef.pth) |
| MViTv2-small\* | From scratch | 34.87 | 7.00 | 83.63 | 96.51 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/mvit/mvitv2-small_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-small_3rdparty_in1k_20220722-986bd741.pth) |
| MViTv2-base\* | From scratch | 51.47 | 10.20 | 84.34 | 96.86 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/mvit/mvitv2-base_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-base_3rdparty_in1k_20220722-9c4f0a17.pth) |
| MViTv2-large\* | From scratch | 217.99 | 42.10 | 85.25 | 97.14 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/mvit/mvitv2-large_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-large_3rdparty_in1k_20220722-2b57b983.pth) |

*Models with * are converted from the [official repo](https://github.com/facebookresearch/mvit). The config files of these models are only for inference. We haven't verified their training accuracy, and you are welcome to contribute your reproduction results.*
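
For a quick sanity check of a converted checkpoint, the mmcls 0.x inference API can be pointed at the config and weight links in the table above (a sketch; `demo.JPEG` stands in for any test image):

```python
from mmcls.apis import inference_model, init_model

config = 'configs/mvit/mvitv2-tiny_8xb256_in1k.py'
checkpoint = ('https://download.openmmlab.com/mmclassification/v0/mvit/'
              'mvitv2-tiny_3rdparty_in1k_20220722-db7beeef.pth')

model = init_model(config, checkpoint, device='cpu')
result = inference_model(model, 'demo.JPEG')  # placeholder image path
print(result['pred_class'], result['pred_score'])
```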

## Citation

```bibtex
@inproceedings{li2021improved,
  title={MViTv2: Improved multiscale vision transformers for classification and detection},
  author={Li, Yanghao and Wu, Chao-Yuan and Fan, Haoqi and Mangalam, Karttikeya and Xiong, Bo and Malik, Jitendra and Feichtenhofer, Christoph},
  booktitle={CVPR},
  year={2022}
}
```