open-mmlab · mzr1996 · Sep 27, 2022 · Aug 30, 2022 · Aug 30, 2022 · Aug 30, 2022
diff --git a/configs/_base_/models/van/van_b0.py b/configs/_base_/models/van/van_b0.py
@@ -0,0 +1,21 @@
+# model settings
+model = dict(
+    type='ImageClassifier',
+    backbone=dict(type='VAN', arch='b0', drop_path_rate=0.1),
+    neck=dict(type='GlobalAveragePooling'),
+    head=dict(
+        type='LinearClsHead',
+        num_classes=1000,
+        in_channels=256,
+        init_cfg=None,  # suppress the default init_cfg of LinearClsHead.
+        loss=dict(
+            type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
+        cal_acc=False),
+    init_cfg=[
+        dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
+        dict(type='Constant', layer='LayerNorm', val=1., bias=0.)
+    ],
+    train_cfg=dict(augments=[
+        dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5),
+        dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5)
+    ]))
diff --git a/configs/_base_/models/van/van_b1.py b/configs/_base_/models/van/van_b1.py
@@ -0,0 +1,21 @@
+# model settings
+model = dict(
+    type='ImageClassifier',
+    backbone=dict(type='VAN', arch='b1', drop_path_rate=0.1),
+    neck=dict(type='GlobalAveragePooling'),
+    head=dict(
+        type='LinearClsHead',
+        num_classes=1000,
+        in_channels=512,
+        init_cfg=None,  # suppress the default init_cfg of LinearClsHead.
+        loss=dict(
+            type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
+        cal_acc=False),
+    init_cfg=[
+        dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
+        dict(type='Constant', layer='LayerNorm', val=1., bias=0.)
+    ],
+    train_cfg=dict(augments=[
+        dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5),
+        dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5)
+    ]))
diff --git a/configs/_base_/models/van/van_b2.py b/configs/_base_/models/van/van_b2.py
@@ -0,0 +1,13 @@
+# model settings
+model = dict(
+    type='ImageClassifier',
+    backbone=dict(type='VAN', arch='b2', drop_path_rate=0.1),
+    neck=dict(type='GlobalAveragePooling'),
+    head=dict(
+        type='LinearClsHead',
+        num_classes=1000,
+        in_channels=512,
+        init_cfg=None,  # suppress the default init_cfg of LinearClsHead.
+        loss=dict(
+            type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
+        cal_acc=False))
diff --git a/configs/_base_/models/van/van_b3.py b/configs/_base_/models/van/van_b3.py
@@ -0,0 +1,13 @@
+# model settings
+model = dict(
+    type='ImageClassifier',
+    backbone=dict(type='VAN', arch='b3', drop_path_rate=0.2),
+    neck=dict(type='GlobalAveragePooling'),
+    head=dict(
+        type='LinearClsHead',
+        num_classes=1000,
+        in_channels=512,
+        init_cfg=None,  # suppress the default init_cfg of LinearClsHead.
+        loss=dict(
+            type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
+        cal_acc=False))
diff --git a/configs/_base_/models/van/van_b4.py b/configs/_base_/models/van/van_b4.py
@@ -0,0 +1,13 @@
+# model settings
+model = dict(
+    type='ImageClassifier',
+    backbone=dict(type='VAN', arch='b4', drop_path_rate=0.2),
+    neck=dict(type='GlobalAveragePooling'),
+    head=dict(
+        type='LinearClsHead',
+        num_classes=1000,
+        in_channels=512,
+        init_cfg=None,  # suppress the default init_cfg of LinearClsHead.
+        loss=dict(
+            type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
+        cal_acc=False))
diff --git a/configs/_base_/models/van/van_b5.py b/configs/_base_/models/van/van_b5.py
@@ -0,0 +1,13 @@
+# model settings
+model = dict(
+    type='ImageClassifier',
+    backbone=dict(type='VAN', arch='b5', drop_path_rate=0.2),
+    neck=dict(type='GlobalAveragePooling'),
+    head=dict(
+        type='LinearClsHead',
+        num_classes=1000,
+        in_channels=768,
+        init_cfg=None,  # suppress the default init_cfg of LinearClsHead.
+        loss=dict(
+            type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
+        cal_acc=False))
diff --git a/configs/_base_/models/van/van_b6.py b/configs/_base_/models/van/van_b6.py
@@ -0,0 +1,13 @@
+# model settings
+model = dict(
+    type='ImageClassifier',
+    backbone=dict(type='VAN', arch='b6', drop_path_rate=0.3),
+    neck=dict(type='GlobalAveragePooling'),
+    head=dict(
+        type='LinearClsHead',
+        num_classes=1000,
+        in_channels=768,
+        init_cfg=None,  # suppress the default init_cfg of LinearClsHead.
+        loss=dict(
+            type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
+        cal_acc=False))
diff --git a/configs/_base_/models/van/van_base.py b/configs/_base_/models/van/van_base.py
@@ -1,13 +1 @@
-# model settings
-model = dict(
-    type='ImageClassifier',
-    backbone=dict(type='VAN', arch='base', drop_path_rate=0.1),
-    neck=dict(type='GlobalAveragePooling'),
-    head=dict(
-        type='LinearClsHead',
-        num_classes=1000,
-        in_channels=512,
-        init_cfg=None,  # suppress the default init_cfg of LinearClsHead.
-        loss=dict(
-            type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
-        cal_acc=False))
+_base_ = ['./van-b2.py']
diff --git a/configs/_base_/models/van/van_large.py b/configs/_base_/models/van/van_large.py
@@ -1,13 +1 @@
-# model settings
-model = dict(
-    type='ImageClassifier',
-    backbone=dict(type='VAN', arch='large', drop_path_rate=0.2),
-    neck=dict(type='GlobalAveragePooling'),
-    head=dict(
-        type='LinearClsHead',
-        num_classes=1000,
-        in_channels=512,
-        init_cfg=None,  # suppress the default init_cfg of LinearClsHead.
-        loss=dict(
-            type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
-        cal_acc=False))
+_base_ = ['./van-b3.py']
diff --git a/configs/_base_/models/van/van_small.py b/configs/_base_/models/van/van_small.py
@@ -1,21 +1 @@
-# model settings
-model = dict(
-    type='ImageClassifier',
-    backbone=dict(type='VAN', arch='small', drop_path_rate=0.1),
-    neck=dict(type='GlobalAveragePooling'),
-    head=dict(
-        type='LinearClsHead',
-        num_classes=1000,
-        in_channels=512,
-        init_cfg=None,  # suppress the default init_cfg of LinearClsHead.
-        loss=dict(
-            type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
-        cal_acc=False),
-    init_cfg=[
-        dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
-        dict(type='Constant', layer='LayerNorm', val=1., bias=0.)
-    ],
-    train_cfg=dict(augments=[
-        dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5),
-        dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5)
-    ]))
+_base_ = ['./van-b1.py']
diff --git a/configs/_base_/models/van/van_tiny.py b/configs/_base_/models/van/van_tiny.py
@@ -1,21 +1 @@
-# model settings
-model = dict(
-    type='ImageClassifier',
-    backbone=dict(type='VAN', arch='tiny', drop_path_rate=0.1),
-    neck=dict(type='GlobalAveragePooling'),
-    head=dict(
-        type='LinearClsHead',
-        num_classes=1000,
-        in_channels=256,
-        init_cfg=None,  # suppress the default init_cfg of LinearClsHead.
-        loss=dict(
-            type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
-        cal_acc=False),
-    init_cfg=[
-        dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
-        dict(type='Constant', layer='LayerNorm', val=1., bias=0.)
-    ],
-    train_cfg=dict(augments=[
-        dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5),
-        dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5)
-    ]))
+_base_ = ['./van-b0.py']
diff --git a/configs/van/README.md b/configs/van/README.md
@@ -16,15 +16,28 @@ While originally designed for natural language processing (NLP) tasks, the self-
 
 ### ImageNet-1k
 
-|  Model  |   Pretrain   | resolution | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) |                               Config                                |                               Download                                |
-| :-----: | :----------: | :--------: | :-------: | :------: | :-------: | :-------: | :-----------------------------------------------------------------: | :-------------------------------------------------------------------: |
-| VAN-T\* | From scratch |  224x224   |   4.11    |   0.88   |   75.41   |   93.02   | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/van/van-tiny_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/van/van-tiny_8xb128_in1k_20220501-385941af.pth) |
-| VAN-S\* | From scratch |  224x224   |   13.86   |   2.52   |   81.01   |   95.63   | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/van/van-small_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/van/van-small_8xb128_in1k_20220501-17bc91aa.pth) |
-| VAN-B\* | From scratch |  224x224   |   26.58   |   5.03   |   82.80   |   96.21   | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/van/van-base_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/van/van-base_8xb128_in1k_20220501-6a4cc31b.pth) |
-| VAN-L\* | From scratch |  224x224   |   44.77   |   8.99   |   83.86   |   96.73   | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/van/van-large_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/van/van-large_8xb128_in1k_20220501-f212ba21.pth) |
+|  Model   |   Pretrain   | resolution | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) |                               Config                               |                               Download                                |
+| :------: | :----------: | :--------: | :-------: | :------: | :-------: | :-------: | :----------------------------------------------------------------: | :-------------------------------------------------------------------: |
+| VAN-B0\* | From scratch |  224x224   |   4.11    |   0.88   |   75.41   |   93.02   | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/van/van-b0_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/van/van-tiny_8xb128_in1k_20220501-385941af.pth) |
+| VAN-B1\* | From scratch |  224x224   |   13.86   |   2.52   |   81.01   |   95.63   | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/van/van-b1_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/van/van-small_8xb128_in1k_20220501-17bc91aa.pth) |
+| VAN-B2\* | From scratch |  224x224   |   26.58   |   5.03   |   82.80   |   96.21   | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/van/van-b2_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/van/van-base_8xb128_in1k_20220501-6a4cc31b.pth) |
+| VAN-B3\* | From scratch |  224x224   |   44.77   |   8.99   |   83.86   |   96.73   | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/van/van-b3_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/van/van-large_8xb128_in1k_20220501-f212ba21.pth) |
+| VAN-B4\* | From scratch |  224x224   |   60.28   |  12.22   |   84.13   |   96.86   | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/van/van-b4_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/van/van-b4_3rdparty_in1k_20220909-f4665b92.pth) |
 
 \*Models with * are converted from [the official repo](https://github.com/Visual-Attention-Network/VAN-Classification). The config files of these models are only for validation. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.
 
+### Pre-trained Models
+
+The pre-trained models on ImageNet-21k are used to fine-tune on the downstream tasks.
+
+|  Model   |   Pretrain   | resolution | Params(M) | Flops(G) |                                                  Download                                                   |
+| :------: | :----------: | :--------: | :-------: | :------: | :---------------------------------------------------------------------------------------------------------: |
+| VAN-B4\* | ImageNet-21k |  224x224   |   60.28   |  12.22   | [model](https://download.openmmlab.com/mmclassification/v0/van/van-b4_3rdparty_in21k_20220909-db926b18.pth) |
+| VAN-B5\* | ImageNet-21k |  224x224   |   89.97   |  17.21   | [model](https://download.openmmlab.com/mmclassification/v0/van/van-b5_3rdparty_in21k_20220909-18e904e3.pth) |
+| VAN-B6\* | ImageNet-21k |  224x224   |   283.9   |  55.28   | [model](https://download.openmmlab.com/mmclassification/v0/van/van-b6_3rdparty_in21k_20220909-96c2cb3a.pth) |
+
+\*Models with * are converted from [the official repo](https://github.com/Visual-Attention-Network/VAN-Classification).
+
 ## Citation
 
 ```

diff --git a/configs/van/metafile.yml b/configs/van/metafile.yml
@@ -7,6 +7,7 @@ Collections:
         - Weight Decay
       Architecture:
         - Visual Attention Network
+        - LKA
     Paper:
       URL: https://arxiv.org/pdf/2202.09741v2.pdf
       Title: "Visual Attention Network"
@@ -16,10 +17,10 @@ Collections:
       Version: v0.23.0
 
 Models:
-  - Name: van-tiny_8xb128_in1k
+  - Name: van-b0_3rdparty_in1k
     Metadata:
-      FLOPs: 4110000      # 4.11M
-      Parameters: 880000000   # 0.88G
+      FLOPs: 880000000   # 0.88G
+      Parameters: 4110000      # 4.11M
     In Collection: Visual-Attention-Network
     Results:
       - Dataset: ImageNet-1k
@@ -28,11 +29,11 @@ Models:
           Top 5 Accuracy: 93.02
         Task: Image Classification
     Weights: https://download.openmmlab.com/mmclassification/v0/van/van-tiny_8xb128_in1k_20220501-385941af.pth
-    Config: configs/van/van-tiny_8xb128_in1k.py
-  - Name: van-small_8xb128_in1k
+    Config: configs/van/van-b0_8xb128_in1k.py
+  - Name: van-b1_3rdparty_in1k
     Metadata:
-      FLOPs:  13860000          # 13.86M
-      Parameters: 2520000000    # 2.52G
+      FLOPs:  2520000000    # 2.52G
+      Parameters: 13860000          # 13.86M
     In Collection: Visual-Attention-Network
     Results:
         - Dataset: ImageNet-1k
@@ -41,11 +42,11 @@ Models:
             Top 5 Accuracy: 95.63
           Task: Image Classification
     Weights: https://download.openmmlab.com/mmclassification/v0/van/van-small_8xb128_in1k_20220501-17bc91aa.pth
-    Config: configs/van/van-small_8xb128_in1k.py
-  - Name: van-base_8xb128_in1k
+    Config: configs/van/van-b1_8xb128_in1k.py
+  - Name: van-b2_3rdparty_in1k
     Metadata:
-      FLOPs: 26580000            # 26.58M
-      Parameters: 5030000000                # 5.03G
+      FLOPs: 5030000000                # 5.03G
+      Parameters: 26580000            # 26.58M
     In Collection: Visual-Attention-Network
     Results:
         - Dataset: ImageNet-1k
@@ -54,11 +55,11 @@ Models:
             Top 5 Accuracy: 96.21
           Task: Image Classification
     Weights: https://download.openmmlab.com/mmclassification/v0/van/van-base_8xb128_in1k_20220501-6a4cc31b.pth
-    Config: configs/van/van-base_8xb128_in1k.py
-  - Name: van-large_8xb128_in1k
+    Config: configs/van/van-b2_8xb128_in1k.py
+  - Name: van-b3_3rdparty_in1k
     Metadata:
-      FLOPs: 44770000              # 44.77 M
-      Parameters: 8990000000              # 8.99G
+      FLOPs: 8990000000              # 8.99G
+      Parameters: 44770000              # 44.77M
     In Collection: Visual-Attention-Network
     Results:
         - Dataset: ImageNet-1k
@@ -67,4 +68,17 @@ Models:
             Top 5 Accuracy: 96.73
           Task: Image Classification
     Weights: https://download.openmmlab.com/mmclassification/v0/van/van-large_8xb128_in1k_20220501-f212ba21.pth
-    Config: configs/van/van-large_8xb128_in1k.py
+    Config: configs/van/van-b3_8xb128_in1k.py
+  - Name: van-b4_3rdparty_in1k
+    Metadata:
+      FLOPs: 12220000000              # 12.22G
+      Parameters: 60280000              # 60.28M
+    In Collection: Visual-Attention-Network
+    Results:
+        - Dataset: ImageNet-1k
+          Metrics:
+            Top 1 Accuracy: 84.13
+            Top 5 Accuracy: 96.86
+          Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/van/van-b4_3rdparty_in1k_20220909-f4665b92.pth
+    Config: configs/van/van-b4_8xb128_in1k.py
diff --git a/configs/van/van-b0_8xb128_in1k.py b/configs/van/van-b0_8xb128_in1k.py
@@ -0,0 +1,61 @@
+_base_ = [
+    '../_base_/models/van/van_b0.py',
+    '../_base_/datasets/imagenet_bs64_swin_224.py',
+    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
+    '../_base_/default_runtime.py'
+]
+
+# Note that the mean and variance used here are different from other configs
+img_norm_cfg = dict(
+    mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='RandomResizedCrop',
+        size=224,
+        backend='pillow',
+        interpolation='bicubic'),
+    dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'),
+    dict(
+        type='RandAugment',
+        policies={{_base_.rand_increasing_policies}},
+        num_policies=2,
+        total_level=10,
+        magnitude_level=9,
+        magnitude_std=0.5,
+        hparams=dict(
+            pad_val=[round(x) for x in img_norm_cfg['mean'][::-1]],
+            interpolation='bicubic')),
+    dict(type='ColorJitter', brightness=0.4, contrast=0.4, saturation=0.4),
+    dict(
+        type='RandomErasing',
+        erase_prob=0.25,
+        mode='rand',
+        min_area_ratio=0.02,
+        max_area_ratio=1 / 3,
+        fill_color=img_norm_cfg['mean'][::-1],
+        fill_std=img_norm_cfg['std'][::-1]),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='ImageToTensor', keys=['img']),
+    dict(type='ToTensor', keys=['gt_label']),
+    dict(type='Collect', keys=['img', 'gt_label'])
+]
+
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='Resize',
+        size=(248, -1),
+        backend='pillow',
+        interpolation='bicubic'),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='ImageToTensor', keys=['img']),
+    dict(type='Collect', keys=['img'])
+]
+
+data = dict(
+    samples_per_gpu=128,
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))