From 5e3f490050873ee4f947c74a9663fbc1857d19b1 Mon Sep 17 00:00:00 2001 From: Abhishek Chaurasia Date: Tue, 27 Sep 2022 18:02:16 -0700 Subject: [PATCH] MobileViTv3 first commit --- .gitignore | 24 + LICENSE | 71 ++ .../mobilevitv3_small_multiserver.yaml | 91 ++ .../mobilevitv3_small_oneserver.yaml | 90 ++ .../mobilevitv3_x_small_multiserver.yaml | 91 ++ .../mobilevitv3_x_small_oneserver.yaml | 91 ++ .../mobilevitv3_xx_small_oneserver.yaml | 90 ++ .../detection/ssd_mobilevitv3_small_320.yaml | 93 ++ .../ssd_mobilevitv3_x_small_320.yaml | 93 ++ .../ssd_mobilevitv3_xx_small_320.yaml | 93 ++ .../deeplabv3_mobilevitv3_small.yaml | 109 ++ .../deeplabv3_mobilevitv3_x_small.yaml | 109 ++ .../deeplabv3_mobilevitv3_xx_small.yaml | 109 ++ .../models/classification/config/mobilevit.py | 553 ++++++++ .../cvnets/models/classification/mobilevit.py | 223 ++++ MobileViTv3-v1/cvnets/modules/__init__.py | 29 + .../cvnets/modules/mobilevit_block.py | 238 ++++ MobileViTv3-v1/environment_cvnet.yml | 153 +++ MobileViTv3-v1/main_latency.py | 183 +++ MobileViTv3-v1/setup.py | 77 ++ .../mobilevit_v3.yaml | 80 ++ .../imagenet/mobilevit_v3_multiserver.py | 103 ++ .../imagenet/mobilevit_v3_oneserver.py | 97 ++ .../detection/ssd_coco/mobilevit_v3.yaml | 95 ++ .../ade20k/deeplabv3_mobilevitv3.yaml | 108 ++ .../ade20k/pspnet_mobilevitv3.yaml | 108 ++ .../pascal_voc/deeplabv3_mobilevitv3.yaml | 114 ++ .../pascal_voc/pspnet_mobilevitv3.yaml | 114 ++ .../classification/config/mobilevit_v3.py | 76 ++ .../models/classification/mobilevit_v3.py | 226 ++++ MobileViTv3-v2/cvnets/modules/__init__.py | 30 + .../cvnets/modules/mobilevit_block.py | 1107 +++++++++++++++++ MobileViTv3-v2/environment_mbvt2.yml | 172 +++ README.md | 73 ++ 34 files changed, 5113 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 MobileViTv3-v1/config/classification/mobilevitv3_small_multiserver.yaml create mode 100644 MobileViTv3-v1/config/classification/mobilevitv3_small_oneserver.yaml create mode 100644 MobileViTv3-v1/config/classification/mobilevitv3_x_small_multiserver.yaml create mode 100644 MobileViTv3-v1/config/classification/mobilevitv3_x_small_oneserver.yaml create mode 100644 MobileViTv3-v1/config/classification/mobilevitv3_xx_small_oneserver.yaml create mode 100644 MobileViTv3-v1/config/detection/ssd_mobilevitv3_small_320.yaml create mode 100644 MobileViTv3-v1/config/detection/ssd_mobilevitv3_x_small_320.yaml create mode 100644 MobileViTv3-v1/config/detection/ssd_mobilevitv3_xx_small_320.yaml create mode 100644 MobileViTv3-v1/config/segmentation/deeplabv3_mobilevitv3_small.yaml create mode 100644 MobileViTv3-v1/config/segmentation/deeplabv3_mobilevitv3_x_small.yaml create mode 100644 MobileViTv3-v1/config/segmentation/deeplabv3_mobilevitv3_xx_small.yaml create mode 100644 MobileViTv3-v1/cvnets/models/classification/config/mobilevit.py create mode 100644 MobileViTv3-v1/cvnets/models/classification/mobilevit.py create mode 100644 MobileViTv3-v1/cvnets/modules/__init__.py create mode 100644 MobileViTv3-v1/cvnets/modules/mobilevit_block.py create mode 100644 MobileViTv3-v1/environment_cvnet.yml create mode 100644 MobileViTv3-v1/main_latency.py create mode 100644 MobileViTv3-v1/setup.py create mode 100644 MobileViTv3-v2/config/classification/finetune_higher_res_in1k/mobilevit_v3.yaml create mode 100644 MobileViTv3-v2/config/classification/imagenet/mobilevit_v3_multiserver.py create mode 100644 MobileViTv3-v2/config/classification/imagenet/mobilevit_v3_oneserver.py create mode 100644 
MobileViTv3-v2/config/detection/ssd_coco/mobilevit_v3.yaml create mode 100644 MobileViTv3-v2/config/segmentation/ade20k/deeplabv3_mobilevitv3.yaml create mode 100644 MobileViTv3-v2/config/segmentation/ade20k/pspnet_mobilevitv3.yaml create mode 100644 MobileViTv3-v2/config/segmentation/pascal_voc/deeplabv3_mobilevitv3.yaml create mode 100644 MobileViTv3-v2/config/segmentation/pascal_voc/pspnet_mobilevitv3.yaml create mode 100644 MobileViTv3-v2/cvnets/models/classification/config/mobilevit_v3.py create mode 100644 MobileViTv3-v2/cvnets/models/classification/mobilevit_v3.py create mode 100644 MobileViTv3-v2/cvnets/modules/__init__.py create mode 100644 MobileViTv3-v2/cvnets/modules/mobilevit_block.py create mode 100644 MobileViTv3-v2/environment_mbvt2.yml create mode 100644 README.md diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..81a58d5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,24 @@ +*.pyc +__pycache__ +.DS_STORE +.idea +results* +*.png +*.jpg +.idea +*.pt +*.pth + +results* +vision_datasets/ +exp_results/ +exp_results* +results_* +coco_eval_results/ + +*.so +model_zoo +model_zoo/* + +cvnets.egg-info +cvnets.egg-info/* diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..c0977c2 --- /dev/null +++ b/LICENSE @@ -0,0 +1,71 @@ +MICRON TECHNOLOGY, INC. SOFTWARE LICENSE AGREEMENT + +PLEASE READ THIS LICENSE AGREEMENT ("AGREEMENT") FROM MICRON TECHNOLOGY, INC. +("MTI") CAREFULLY: BY INSTALLING, USING, OR MODIFYING THE MTI SOFTWARE AND ANY +RELATED PRINTED MATERIALS ("SOFTWARE"), YOU ARE ACCEPTING AND AGREEING TO THE +TERMS OF THIS AGREEMENT. IF YOU DO NOT AGREE WITH THE TERMS OF THIS AGREEMENT, DO +NOT INSTALL, USE, OR MODIFY THE SOFTWARE. + +LICENSE: + +In consideration of your agreement to abide by the terms of this Agreement, +and subject to these terms, MTI hereby grants to you a personal, non-exclusive +license, under MTI’s copyrights in the Software, to install, use, and copy for +personal use the Software solely for non-commercial uses and purposes subject to +the terms of this Agreement. You must maintain all copyright notices on all +copies of the Software. You agree not to use the Software for any commercial +purpose or for any public display (commercial or noncommercial). MTI may make +changes to the Software at any time without notice to you. In addition, MTI is +under no obligation whatsoever to update, maintain, or provide new versions or +other support for the Software. This license shall automatically terminate if you +violate any of the terms of this license and may be terminated by MTI at any +time and for any reason without notice. Upon termination of this license, you +must destroy any Software in your possession whether in electronic or printed +format. In any unmodified version of this Software, you must retain this notice +and the following text and disclaimers. Neither the name, trademarks, service +marks or logos of MTI may be used to endorse or promote products derived from the +Software without specific prior written permission from MTI. Except as expressly +stated in this notice, no other rights or licenses, express or implied, are +granted by MTI herein, including but not limited to any patent rights that may be +infringed by your derivative works or by other works in which the Software may be +incorporated. 
+ +OWNERSHIP OF MATERIALS: + +You acknowledge and agree that the Software and any derivative works thereof +are proprietary property of MTI (and/or its affiliated companies) and protected +by United States copyright law and international treaty provisions. The Software +may also be the subject of pending patent applications or granted patents. MTI +does not grant any express or implied rights hereunder to any patents, copyrights, +or trademarks for any commercial uses or purposes. You further acknowledge and +agree that all right, title, and interest in and to the Software, including +associated proprietary rights, are and shall remain with MTI (and/or its +affiliated companies). This Agreement does not convey to you an interest in or to +the Software, but only a limited right to install and use the Software in +accordance with the terms of this Agreement. The Software is licensed to you and +not sold. + +DISCLAIMER OF WARRANTY: + +THE SOFTWARE IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND. MTI, ON BEHALF OF +ITSELF AND ITS AFFILIATED COMPANIES, EXPRESSLY DISCLAIMS ALL WARRANTIES EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO, NONINFRINGEMENT OF THIRD-PARTY RIGHTS, +AND ANY IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR +PURPOSE. MTI DOES NOT WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS, OR +THAT THE OPERATION OF THE SOFTWARE WILL BE UNINTERRUPTED OR ERROR-FREE. +FURTHERMORE, MTI DOES NOT MAKE ANY REPRESENTATIONS REGARDING THE USE OR THE +RESULTS OF THE USE OF THE SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, +RELIABILITY, OR OTHERWISE. THE ENTIRE RISK ARISING OUT OF USE OR PERFORMANCE OF +THE SOFTWARE REMAINS WITH YOU. IN NO EVENT SHALL MTI OR ITS AFFILIATED COMPANIES +BE LIABLE FOR ANY DIRECT, INDIRECT, CONSEQUENTIAL, INCIDENTAL, OR SPECIAL DAMAGES +(INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF PROFITS, BUSINESS +INTERRUPTION, OR LOSS OF INFORMATION) ARISING OUT OF YOUR USE OF OR INABILITY TO +USE THE SOFTWARE, EVEN IF MTI HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. +Because some jurisdictions prohibit the exclusion or limitation of liability for +consequential or incidental damages, the above limitation may not apply to you. +This Agreement constitutes the entire agreement between MTI and you regarding the +subject matter hereof and supersedes all previous oral or written communications +between the parties. This Agreement shall be governed by the laws of the State of +Delaware without regard to its conflict of laws rules. By proceeding with the +installation of the Software, you agree to the terms of this Agreement. You must +agree to the terms in order to use the Software. 
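Note: the classification configs that follow distinguish one-server and multi-server ImageNet training mainly through their `ddp` blocks (`dist_url`, `rank`, `world_size`, `dist_port`). As orientation only, here is a minimal sketch of how such fields typically map onto PyTorch's process-group setup; the cvnets training entry point handles this internally, and the function name `init_distributed` and the `nccl` backend below are assumptions, not part of this patch.

import torch.distributed as dist

def init_distributed(dist_url: str, rank: int, world_size: int) -> None:
    # dist_url points at the rank-0 server ("tcp://<ip>:<port>" in the *_multiserver.yaml
    # configs); rank must be unique per participating server, and world_size is the total
    # number of processes (the -1 in the one-server configs is presumably resolved to the
    # local GPU count by the launcher).
    dist.init_process_group(
        backend="nccl",          # typical backend for multi-GPU training
        init_method=dist_url,
        rank=rank,
        world_size=world_size,
    )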
diff --git a/MobileViTv3-v1/config/classification/mobilevitv3_small_multiserver.yaml b/MobileViTv3-v1/config/classification/mobilevitv3_small_multiserver.yaml new file mode 100644 index 0000000..0fb5c06 --- /dev/null +++ b/MobileViTv3-v1/config/classification/mobilevitv3_small_multiserver.yaml @@ -0,0 +1,91 @@ +common: + run_label: "run_1" + log_freq: 500 + auto_resume: true + mixed_precision: true +dataset: + root_train: "Datasets/ILSVRC2012-raw/train" + root_val: "Datasets/ILSVRC2012-raw/val" + name: "imagenet" + category: "classification" + train_batch_size0: 32 + val_batch_size0: 32 + eval_batch_size0: 1 + workers: 6 + persistent_workers: false + pin_memory: true +image_augmentation: + random_resized_crop: + enable: true + interpolation: "bilinear" + random_horizontal_flip: + enable: true +sampler: + name: "variable_batch_sampler" + vbs: + crop_size_width: 256 + crop_size_height: 256 + max_n_scales: 5 + min_crop_size_width: 160 + max_crop_size_width: 320 + min_crop_size_height: 160 + max_crop_size_height: 320 + check_scale: 32 +loss: + category: "classification" + classification: + name: "label_smoothing" + label_smoothing_factor: 0.1 +optim: + name: "adamw" + weight_decay: 0.01 + no_decay_bn_filter_bias: false + adamw: + beta1: 0.9 + beta2: 0.999 +scheduler: + name: "cosine" + is_iteration_based: false + max_epochs: 300 + warmup_iterations: 3000 + warmup_init_lr: 0.0002 + cosine: + max_lr: 0.002 + min_lr: 0.0002 +model: + classification: + name: "mobilevit_v3" + classifier_dropout: 0.1 + mit: + mode: "small_v3" #"small_v3_fast" + ffn_dropout: 0.0 + attn_dropout: 0.0 + dropout: 0.1 + number_heads: 4 + no_fuse_local_global_features: false + conv_kernel_size: 3 + activation: + name: "swish" + normalization: + name: "batch_norm_2d" + momentum: 0.1 + activation: + name: "swish" + layer: + global_pool: "mean" + conv_init: "kaiming_normal" + linear_init: "trunc_normal" + linear_init_std_dev: 0.02 +ema: + enable: true + momentum: 0.0005 +ddp: + enable: true + dist_url: "tcp://XX.XXX.XXX.XXX:XXXX" # ip address of server with rank 0 + rank: 0 # rank value unique for each server + world_size: 6 + dist_port: 30786 +stats: + name: [ "loss", "top1", "top5" ] + checkpoint_metric: "top1" + checkpoint_metric_max: true diff --git a/MobileViTv3-v1/config/classification/mobilevitv3_small_oneserver.yaml b/MobileViTv3-v1/config/classification/mobilevitv3_small_oneserver.yaml new file mode 100644 index 0000000..4b3fd4e --- /dev/null +++ b/MobileViTv3-v1/config/classification/mobilevitv3_small_oneserver.yaml @@ -0,0 +1,90 @@ +common: + run_label: "run_1" + log_freq: 500 + auto_resume: true + mixed_precision: true +dataset: + root_train: "Datasets/ILSVRC2012-raw/train" + root_val: "Datasets/ILSVRC2012-raw/val" + name: "imagenet" + category: "classification" + train_batch_size0: 32 + val_batch_size0: 32 + eval_batch_size0: 1 + workers: 6 + persistent_workers: false + pin_memory: true +image_augmentation: + random_resized_crop: + enable: true + interpolation: "bilinear" + random_horizontal_flip: + enable: true +sampler: + name: "variable_batch_sampler" + vbs: + crop_size_width: 256 + crop_size_height: 256 + max_n_scales: 5 + min_crop_size_width: 160 + max_crop_size_width: 320 + min_crop_size_height: 160 + max_crop_size_height: 320 + check_scale: 32 +loss: + category: "classification" + classification: + name: "label_smoothing" + label_smoothing_factor: 0.1 +optim: + name: "adamw" + weight_decay: 0.01 + no_decay_bn_filter_bias: false + adamw: + beta1: 0.9 + beta2: 0.999 +scheduler: + name: "cosine" + 
is_iteration_based: false + max_epochs: 300 + warmup_iterations: 3000 + warmup_init_lr: 0.0002 + cosine: + max_lr: 0.002 + min_lr: 0.0002 +model: + classification: + name: "mobilevit_v3" + classifier_dropout: 0.1 + mit: + mode: "small_v3" + ffn_dropout: 0.0 + attn_dropout: 0.0 + dropout: 0.1 + number_heads: 4 + no_fuse_local_global_features: false + conv_kernel_size: 3 + activation: + name: "swish" + normalization: + name: "batch_norm_2d" + momentum: 0.1 + activation: + name: "swish" + layer: + global_pool: "mean" + conv_init: "kaiming_normal" + linear_init: "trunc_normal" + linear_init_std_dev: 0.02 +ema: + enable: true + momentum: 0.0005 +ddp: + enable: true + rank: 0 + world_size: -1 + dist_port: 30786 +stats: + name: [ "loss", "top1", "top5" ] + checkpoint_metric: "top1" + checkpoint_metric_max: true diff --git a/MobileViTv3-v1/config/classification/mobilevitv3_x_small_multiserver.yaml b/MobileViTv3-v1/config/classification/mobilevitv3_x_small_multiserver.yaml new file mode 100644 index 0000000..85fbc26 --- /dev/null +++ b/MobileViTv3-v1/config/classification/mobilevitv3_x_small_multiserver.yaml @@ -0,0 +1,91 @@ +common: + run_label: "run_1" + log_freq: 500 + auto_resume: true + mixed_precision: true +dataset: + root_train: "/media/Datasets/ILSVRC2012-raw/train" + root_val: "/media/Datasets/ILSVRC2012-raw/val" + name: "imagenet" + category: "classification" + train_batch_size0: 32 + val_batch_size0: 32 + eval_batch_size0: 1 + workers: 6 + persistent_workers: false + pin_memory: true +image_augmentation: + random_resized_crop: + enable: true + interpolation: "bilinear" + random_horizontal_flip: + enable: true +sampler: + name: "variable_batch_sampler" + vbs: + crop_size_width: 256 + crop_size_height: 256 + max_n_scales: 5 + min_crop_size_width: 160 + max_crop_size_width: 320 + min_crop_size_height: 160 + max_crop_size_height: 320 + check_scale: 32 +loss: + category: "classification" + classification: + name: "label_smoothing" + label_smoothing_factor: 0.1 +optim: + name: "adamw" + weight_decay: 1.e-2 + no_decay_bn_filter_bias: false + adamw: + beta1: 0.9 + beta2: 0.999 +scheduler: + name: "cosine" + is_iteration_based: false + max_epochs: 300 + warmup_iterations: 3000 + warmup_init_lr: 0.0002 + cosine: + max_lr: 0.002 + min_lr: 0.0002 +model: + classification: + name: "mobilevit_v3" + classifier_dropout: 0.1 + mit: + mode: "x_small_v3" + ffn_dropout: 0.0 + attn_dropout: 0.0 + dropout: 0.1 + number_heads: 4 + no_fuse_local_global_features: false + conv_kernel_size: 3 + activation: + name: "swish" + normalization: + name: "batch_norm_2d" + momentum: 0.1 + activation: + name: "swish" + layer: + global_pool: "mean" + conv_init: "kaiming_normal" + linear_init: "trunc_normal" + linear_init_std_dev: 0.02 +ema: + enable: true + momentum: 0.0005 +ddp: + enable: true + dist_url: "tcp://XX.XXX.XXX.XXX:XXXX" # ip address of server with rank 0 + rank: 0 # rank value unique for each server + world_size: 6 + dist_port: 30786 +stats: + name: [ "loss", "top1", "top5" ] + checkpoint_metric: "top1" + checkpoint_metric_max: true diff --git a/MobileViTv3-v1/config/classification/mobilevitv3_x_small_oneserver.yaml b/MobileViTv3-v1/config/classification/mobilevitv3_x_small_oneserver.yaml new file mode 100644 index 0000000..dc1bb2d --- /dev/null +++ b/MobileViTv3-v1/config/classification/mobilevitv3_x_small_oneserver.yaml @@ -0,0 +1,91 @@ +common: + run_label: "run_1" + log_freq: 500 + auto_resume: true + mixed_precision: true +dataset: + root_train: "/media/Datasets/ILSVRC2012-raw/train" + root_val: 
"/media/Datasets/ILSVRC2012-raw/val" + name: "imagenet" + category: "classification" + train_batch_size0: 32 + val_batch_size0: 32 + eval_batch_size0: 1 + workers: 6 + persistent_workers: false + pin_memory: true +image_augmentation: + random_resized_crop: + enable: true + interpolation: "bilinear" + random_horizontal_flip: + enable: true +sampler: + name: "variable_batch_sampler" + vbs: + crop_size_width: 256 + crop_size_height: 256 + max_n_scales: 5 + min_crop_size_width: 160 + max_crop_size_width: 320 + min_crop_size_height: 160 + max_crop_size_height: 320 + check_scale: 32 +loss: + category: "classification" + classification: + name: "label_smoothing" + label_smoothing_factor: 0.1 +optim: + name: "adamw" + weight_decay: 1.e-2 + no_decay_bn_filter_bias: false + adamw: + beta1: 0.9 + beta2: 0.999 +scheduler: + name: "cosine" + is_iteration_based: false + max_epochs: 300 + warmup_iterations: 3000 + warmup_init_lr: 0.0002 + cosine: + max_lr: 0.002 + min_lr: 0.0002 +model: + classification: + name: "mobilevit_v3" + classifier_dropout: 0.1 + mit: + mode: "x_small_v3" + ffn_dropout: 0.0 + attn_dropout: 0.0 + dropout: 0.1 + number_heads: 4 + no_fuse_local_global_features: false + conv_kernel_size: 3 + activation: + name: "swish" + normalization: + name: "batch_norm_2d" + momentum: 0.1 + activation: + name: "swish" + layer: + global_pool: "mean" + conv_init: "kaiming_normal" + linear_init: "trunc_normal" + linear_init_std_dev: 0.02 +ema: + enable: true + momentum: 0.0005 +ddp: + enable: false # false for single server + dist_url: "tcp://XX.XXX.XXX.XXX:XXXX" # ip address of server with rank 0 + rank: 0 # rank value unique for each server + world_size: 6 + dist_port: 30786 +stats: + name: [ "loss", "top1", "top5" ] + checkpoint_metric: "top1" + checkpoint_metric_max: true diff --git a/MobileViTv3-v1/config/classification/mobilevitv3_xx_small_oneserver.yaml b/MobileViTv3-v1/config/classification/mobilevitv3_xx_small_oneserver.yaml new file mode 100644 index 0000000..77d6945 --- /dev/null +++ b/MobileViTv3-v1/config/classification/mobilevitv3_xx_small_oneserver.yaml @@ -0,0 +1,90 @@ +common: + run_label: "run_1" + log_freq: 500 + auto_resume: true + mixed_precision: true +dataset: + root_train: "/media/Datasets/ILSVRC2012-raw/train" + root_val: "/media/Datasets/ILSVRC2012-raw/val" + name: "imagenet" + category: "classification" + train_batch_size0: 64 + val_batch_size0: 64 + eval_batch_size0: 1 + workers: 6 + persistent_workers: false + pin_memory: true +image_augmentation: + random_resized_crop: + enable: true + interpolation: "bilinear" + random_horizontal_flip: + enable: true +sampler: + name: "variable_batch_sampler" + vbs: + crop_size_width: 256 + crop_size_height: 256 + max_n_scales: 5 + min_crop_size_width: 160 + max_crop_size_width: 320 + min_crop_size_height: 160 + max_crop_size_height: 320 + check_scale: 32 +loss: + category: "classification" + classification: + name: "label_smoothing" + label_smoothing_factor: 0.1 +optim: + name: "adamw" + weight_decay: 1.e-2 + no_decay_bn_filter_bias: false + adamw: + beta1: 0.9 + beta2: 0.999 +scheduler: + name: "cosine" + is_iteration_based: false + max_epochs: 300 + warmup_iterations: 3000 + warmup_init_lr: 0.0002 + cosine: + max_lr: 0.002 + min_lr: 0.0002 +model: + classification: + name: "mobilevit_v3" + classifier_dropout: 0.1 + mit: + mode: "xx_small_v3" + ffn_dropout: 0.0 + attn_dropout: 0.0 + dropout: 0.05 + number_heads: 4 + no_fuse_local_global_features: false + conv_kernel_size: 3 + activation: + name: "swish" + normalization: + name: 
"batch_norm_2d" + momentum: 0.1 + activation: + name: "swish" + layer: + global_pool: "mean" + conv_init: "kaiming_normal" + linear_init: "trunc_normal" + linear_init_std_dev: 0.02 +ema: + enable: true + momentum: 0.0005 +ddp: + enable: true + rank: 0 + world_size: -1 + dist_port: 30786 +stats: + name: [ "loss", "top1", "top5" ] + checkpoint_metric: "top1" + checkpoint_metric_max: true diff --git a/MobileViTv3-v1/config/detection/ssd_mobilevitv3_small_320.yaml b/MobileViTv3-v1/config/detection/ssd_mobilevitv3_small_320.yaml new file mode 100644 index 0000000..7065ba0 --- /dev/null +++ b/MobileViTv3-v1/config/detection/ssd_mobilevitv3_small_320.yaml @@ -0,0 +1,93 @@ +common: + run_label: "run_1" + accum_freq: 1 + accum_after_epoch: -1 + log_freq: 500 + auto_resume: true + mixed_precision: true +dataset: + root_train: "/datasets/coco_2017" + root_val: "/datasets/coco_2017" + name: "coco_ssd" + category: "detection" + train_batch_size0: 32 + val_batch_size0: 32 + eval_batch_size0: 1 + workers: 8 # training using 4 gpus + persistent_workers: false + pin_memory: true +sampler: + name: "batch_sampler" + bs: + crop_size_width: 320 + crop_size_height: 320 +loss: + category: "detection" + detection: + name: "ssd_multibox_loss" + ssd_multibox_loss: + neg_pos_ratio: 3 +optim: + name: "adamw" + weight_decay: 0.01 + no_decay_bn_filter_bias: false + adamw: + beta1: 0.9 + beta2: 0.999 +scheduler: + name: "cosine" + is_iteration_based: false + max_epochs: 200 + warmup_iterations: 500 + warmup_init_lr: 0.00009 + cosine: + max_lr: 0.004 # [2.7e-3 * N_GPUS^2 x (BATCH_SIZE_GPU0/ 32) * 0.02 ] # 0.02 comes from this fact 0.1 (ResNet SGD LR)/0.002 (MIT ADAMW LR) + min_lr: 1.e-6 +model: + detection: + name: "ssd" + ssd: + anchors_aspect_ratio: [ [ 2, 3 ], [ 2, 3 ], [ 2, 3 ], [ 2, 3 ], [ 2, 3 ], [2] ] + output_strides: [ 16, 32, 64, 128, 256, -1 ] + proj_channels: [512, 256, 256, 128, 128, 64] + center_variance: 0.1 + size_variance: 0.2 + iou_threshold: 0.5 + nms_iou_threshold: 0.5 + classification: + name: "mobilevit_v3" + classifier_dropout: 0.1 + mit: + mode: "small_v3" + ffn_dropout: 0.0 + attn_dropout: 0.0 + dropout: 0.1 + number_heads: 4 + no_fuse_local_global_features: false + conv_kernel_size: 3 + activation: + name: "swish" + pretrained: "results/mobilevitv3_small_e300_7930/run_1/checkpoint_ema_best.pt" + normalization: + name: "sync_batch_norm" + momentum: 0.1 + activation: + name: "relu" # If specific activation function is not specified, this one will be used as a default + inplace: false + layer: + global_pool: "mean" + conv_init: "kaiming_normal" + linear_init: "normal" + conv_weight_std: false +ema: + enable: true + momentum: 0.0005 +ddp: + enable: true + rank: 0 + world_size: -1 + dist_port: 30786 +stats: + name: [ "loss" ] + checkpoint_metric: "loss" + checkpoint_metric_max: false diff --git a/MobileViTv3-v1/config/detection/ssd_mobilevitv3_x_small_320.yaml b/MobileViTv3-v1/config/detection/ssd_mobilevitv3_x_small_320.yaml new file mode 100644 index 0000000..1b8e3eb --- /dev/null +++ b/MobileViTv3-v1/config/detection/ssd_mobilevitv3_x_small_320.yaml @@ -0,0 +1,93 @@ +common: + run_label: "run_1" + accum_freq: 1 + accum_after_epoch: -1 + log_freq: 500 + auto_resume: true + mixed_precision: true +dataset: + root_train: "/datasets/coco_2017" + root_val: "/datasets/coco_2017" + name: "coco_ssd" + category: "detection" + train_batch_size0: 32 + val_batch_size0: 32 + eval_batch_size0: 1 + workers: 12 + persistent_workers: false + pin_memory: true +sampler: + name: "batch_sampler" + bs: + 
crop_size_width: 320 + crop_size_height: 320 +loss: + category: "detection" + detection: + name: "ssd_multibox_loss" + ssd_multibox_loss: + neg_pos_ratio: 3 +optim: + name: "adamw" + weight_decay: 0.01 + no_decay_bn_filter_bias: false + adamw: + beta1: 0.9 + beta2: 0.999 +scheduler: + name: "cosine" + is_iteration_based: false + max_epochs: 200 + warmup_iterations: 500 + warmup_init_lr: 0.00009 + cosine: + max_lr: 0.0009 # [2.7e-3 * N_GPUS^2 x (BATCH_SIZE_GPU0/ 32) * 0.02 ] # 0.02 comes from this fact 0.1 (ResNet SGD LR)/0.002 (MIT ADAMW LR) + min_lr: 1.e-6 +model: + detection: + name: "ssd" + ssd: + anchors_aspect_ratio: [ [ 2, 3 ], [ 2, 3 ], [ 2, 3 ], [ 2, 3 ], [ 2, 3 ], [2] ] + output_strides: [ 16, 32, 64, 128, 256, -1 ] + proj_channels: [512, 256, 256, 128, 128, 64] + center_variance: 0.1 + size_variance: 0.2 + iou_threshold: 0.5 + nms_iou_threshold: 0.5 + classification: + name: "mobilevit_v3" + classifier_dropout: 0.1 + mit: + mode: "x_small_v3" + ffn_dropout: 0.0 + attn_dropout: 0.0 + dropout: 0.1 + number_heads: 4 + no_fuse_local_global_features: false + conv_kernel_size: 3 + activation: + name: "swish" + pretrained: "results/mobilevitv3_x_small_e300_7671/run_1/checkpoint_ema_best.pt" + normalization: + name: "sync_batch_norm" + momentum: 0.1 + activation: + name: "relu" # If specific activation function is not specified, this one will be used as a default + inplace: false + layer: + global_pool: "mean" + conv_init: "kaiming_normal" + linear_init: "normal" + conv_weight_std: false +ema: + enable: true + momentum: 0.0005 +ddp: + enable: true + rank: 0 + world_size: -1 + dist_port: 30786 +stats: + name: [ "loss" ] + checkpoint_metric: "loss" + checkpoint_metric_max: false diff --git a/MobileViTv3-v1/config/detection/ssd_mobilevitv3_xx_small_320.yaml b/MobileViTv3-v1/config/detection/ssd_mobilevitv3_xx_small_320.yaml new file mode 100644 index 0000000..202cd00 --- /dev/null +++ b/MobileViTv3-v1/config/detection/ssd_mobilevitv3_xx_small_320.yaml @@ -0,0 +1,93 @@ +common: + run_label: "run_1" + accum_freq: 1 + accum_after_epoch: -1 + log_freq: 500 + auto_resume: true + mixed_precision: true +dataset: + root_train: "/datasets/coco_2017" + root_val: "/datasets/coco_2017" + name: "coco_ssd" + category: "detection" + train_batch_size0: 64 + val_batch_size0: 64 + eval_batch_size0: 1 + workers: 12 + persistent_workers: false + pin_memory: true +sampler: + name: "batch_sampler" + bs: + crop_size_width: 320 + crop_size_height: 320 +loss: + category: "detection" + detection: + name: "ssd_multibox_loss" + ssd_multibox_loss: + neg_pos_ratio: 3 +optim: + name: "adamw" + weight_decay: 0.01 + no_decay_bn_filter_bias: false + adamw: + beta1: 0.9 + beta2: 0.999 +scheduler: + name: "cosine" + is_iteration_based: false + max_epochs: 200 + warmup_iterations: 500 + warmup_init_lr: 0.00009 + cosine: + max_lr: 0.0009 # [2.7e-3 * N_GPUS^2 x (BATCH_SIZE_GPU0/ 32) * 0.02 ] # 0.02 comes from this fact 0.1 (ResNet SGD LR)/0.002 (MIT ADAMW LR) + min_lr: 1.e-6 +model: + detection: + name: "ssd" + ssd: + anchors_aspect_ratio: [ [ 2, 3 ], [ 2, 3 ], [ 2, 3 ], [ 2, 3 ], [ 2, 3 ], [2] ] + output_strides: [ 16, 32, 64, 128, 256, -1 ] + proj_channels: [512, 256, 256, 128, 128, 64] + center_variance: 0.1 + size_variance: 0.2 + iou_threshold: 0.5 + nms_iou_threshold: 0.5 + classification: + name: "mobilevit_v3" + classifier_dropout: 0.1 + mit: + mode: "xx_small_v3" + ffn_dropout: 0.0 + attn_dropout: 0.0 + dropout: 0.05 + number_heads: 4 + no_fuse_local_global_features: false + conv_kernel_size: 3 + activation: + name: 
"swish" + pretrained: "results/mobilevitv3_xx_small_e300_7098/run_1/checkpoint_ema_best.pt" + normalization: + name: "sync_batch_norm" + momentum: 0.1 + activation: + name: "relu" # If specific activation function is not specified, this one will be used as a default + inplace: false + layer: + global_pool: "mean" + conv_init: "kaiming_normal" + linear_init: "normal" + conv_weight_std: false +ema: + enable: true + momentum: 0.0005 +ddp: + enable: true + rank: 0 + world_size: -1 + dist_port: 30786 +stats: + name: [ "loss" ] + checkpoint_metric: "loss" + checkpoint_metric_max: false diff --git a/MobileViTv3-v1/config/segmentation/deeplabv3_mobilevitv3_small.yaml b/MobileViTv3-v1/config/segmentation/deeplabv3_mobilevitv3_small.yaml new file mode 100644 index 0000000..39e59fa --- /dev/null +++ b/MobileViTv3-v1/config/segmentation/deeplabv3_mobilevitv3_small.yaml @@ -0,0 +1,109 @@ +common: + run_label: "run_1" + accum_freq: 1 + accum_after_epoch: -1 + log_freq: 200 + auto_resume: true + mixed_precision: true +dataset: + root_train: "/media/Datasets/VOCdevkit" + root_val: "/media/Datasets/VOCdevkit" + name: "pascal" + category: "segmentation" + train_batch_size0: 12 + val_batch_size0: 12 + eval_batch_size0: 1 + workers: 12 + persistent_workers: false + pin_memory: false + pascal: + use_coco_data: true + coco_root_dir: "/media/Datasets/coco_preprocess" +image_augmentation: + random_resize: + enable: true + min_size: 256 + max_size: 1024 + random_crop: + enable: true + mask_fill: 255 + resize_if_needed: true + random_horizontal_flip: + enable: true +sampler: + name: "batch_sampler" + bs: + crop_size_width: 512 + crop_size_height: 512 +loss: + category: "segmentation" + ignore_idx: 255 + segmentation: + name: "cross_entropy" +optim: + name: "adamw" + weight_decay: 0.01 + no_decay_bn_filter_bias: false + adamw: + beta1: 0.9 + beta2: 0.999 +scheduler: + name: "cosine" + is_iteration_based: false + max_epochs: 50 + warmup_iterations: 500 + warmup_init_lr: 0.00009 + cosine: + max_lr: 0.0009 # [2.7e-3 * N_GPUS^2 x (BATCH_SIZE_GPU0/ 32) * 0.02 ] # 0.02 comes from this fact 0.1 (ResNet SGD LR)/0.002 (MIT ADAMW LR) + min_lr: 1.e-6 +model: + segmentation: + name: "encoder_decoder" + lr_multiplier: 10 + seg_head: "deeplabv3" + output_stride: 16 + classifier_dropout: 0.1 + activation: + name: "relu" + deeplabv3: + aspp_dropout: 0.1 + aspp_sep_conv: false + aspp_out_channels: 256 + aspp_rates: [6, 12, 18] + classification: + name: "mobilevit_v3" + classifier_dropout: 0.1 + mit: + mode: "small_v3" + ffn_dropout: 0.0 + attn_dropout: 0.0 + dropout: 0.1 + number_heads: 4 + no_fuse_local_global_features: false + conv_kernel_size: 3 + activation: + name: "swish" + pretrained: "results/mobilevitv3_small_e300_7930/run_1/checkpoint_ema_best.pt" + normalization: + name: "sync_batch_norm" + momentum: 0.1 + activation: + name: "relu" + inplace: false + layer: + global_pool: "mean" + conv_init: "kaiming_normal" + linear_init: "normal" + conv_weight_std: false +ema: + enable: true + momentum: 0.0005 +ddp: + enable: true + rank: 0 + world_size: -1 + dist_port: 30786 +stats: + name: [ "loss", "iou"] + checkpoint_metric: "iou" + checkpoint_metric_max: true diff --git a/MobileViTv3-v1/config/segmentation/deeplabv3_mobilevitv3_x_small.yaml b/MobileViTv3-v1/config/segmentation/deeplabv3_mobilevitv3_x_small.yaml new file mode 100644 index 0000000..08ef6a7 --- /dev/null +++ b/MobileViTv3-v1/config/segmentation/deeplabv3_mobilevitv3_x_small.yaml @@ -0,0 +1,109 @@ +common: + run_label: "run_1" + accum_freq: 1 + accum_after_epoch: -1 + 
log_freq: 200 + auto_resume: true + mixed_precision: true +dataset: + root_train: "/media/Datasets/VOCdevkit/" + root_val: "/media/Datasets/VOCdevkit/" + name: "pascal" + category: "segmentation" + train_batch_size0: 12 + val_batch_size0: 12 + eval_batch_size0: 1 + workers: 12 + persistent_workers: false + pin_memory: false + pascal: + use_coco_data: true + coco_root_dir: "/media/Datasets/coco_preprocess" +image_augmentation: + random_resize: + enable: true + min_size: 256 + max_size: 1024 + random_crop: + enable: true + mask_fill: 255 + resize_if_needed: true + random_horizontal_flip: + enable: true +sampler: + name: "batch_sampler" + bs: + crop_size_width: 512 + crop_size_height: 512 +loss: + category: "segmentation" + ignore_idx: 255 + segmentation: + name: "cross_entropy" +optim: + name: "adamw" + weight_decay: 0.01 + no_decay_bn_filter_bias: false + adamw: + beta1: 0.9 + beta2: 0.999 +scheduler: + name: "cosine" + is_iteration_based: false + max_epochs: 50 + warmup_iterations: 500 + warmup_init_lr: 0.00009 + cosine: + max_lr: 0.0009 # [2.7e-3 * N_GPUS^2 x (BATCH_SIZE_GPU0/ 32) * 0.02 ] # 0.02 comes from this fact 0.1 (ResNet SGD LR)/0.002 (MIT ADAMW LR) + min_lr: 1.e-6 +model: + segmentation: + name: "encoder_decoder" + lr_multiplier: 10 + seg_head: "deeplabv3" + output_stride: 16 + classifier_dropout: 0.1 + activation: + name: "relu" + deeplabv3: + aspp_dropout: 0.1 + aspp_sep_conv: false + aspp_out_channels: 256 + aspp_rates: [6, 12, 18] + classification: + name: "mobilevit_v3" + classifier_dropout: 0.1 + mit: + mode: "x_small_v3" + ffn_dropout: 0.0 + attn_dropout: 0.0 + dropout: 0.1 + number_heads: 4 + no_fuse_local_global_features: false + conv_kernel_size: 3 + activation: + name: "swish" + pretrained: "results/mobilevitv3_x_small_e300_7671/run_1/checkpoint_ema_best.pt" + normalization: + name: "sync_batch_norm" + momentum: 0.1 + activation: + name: "relu" + inplace: false + layer: + global_pool: "mean" + conv_init: "kaiming_normal" + linear_init: "normal" + conv_weight_std: false +ema: + enable: true + momentum: 0.0005 +ddp: + enable: true + rank: 0 + world_size: -1 + dist_port: 30786 +stats: + name: [ "loss", "iou"] + checkpoint_metric: "iou" + checkpoint_metric_max: true diff --git a/MobileViTv3-v1/config/segmentation/deeplabv3_mobilevitv3_xx_small.yaml b/MobileViTv3-v1/config/segmentation/deeplabv3_mobilevitv3_xx_small.yaml new file mode 100644 index 0000000..0ae808d --- /dev/null +++ b/MobileViTv3-v1/config/segmentation/deeplabv3_mobilevitv3_xx_small.yaml @@ -0,0 +1,109 @@ +common: + run_label: "run_1" + accum_freq: 1 + accum_after_epoch: -1 + log_freq: 200 + auto_resume: true + mixed_precision: true +dataset: + root_train: "/media/Datasets/VOCdevkit/" + root_val: "/media/Datasets/VOCdevkit/" + name: "pascal" + category: "segmentation" + train_batch_size0: 16 + val_batch_size0: 16 + eval_batch_size0: 1 + workers: 12 + persistent_workers: false + pin_memory: false + pascal: + use_coco_data: true + coco_root_dir: "/media/Datasets/coco_preprocess" +image_augmentation: + random_resize: + enable: true + min_size: 256 + max_size: 1024 + random_crop: + enable: true + mask_fill: 255 + resize_if_needed: true + random_horizontal_flip: + enable: true +sampler: + name: "batch_sampler" + bs: + crop_size_width: 512 + crop_size_height: 512 +loss: + category: "segmentation" + ignore_idx: 255 + segmentation: + name: "cross_entropy" +optim: + name: "adamw" + weight_decay: 0.01 + no_decay_bn_filter_bias: false + adamw: + beta1: 0.9 + beta2: 0.999 +scheduler: + name: "cosine" + 
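+  # Note on the max_lr comments used throughout these configs: the rule of thumb reads
+  # max_lr ~= 2.7e-3 * N_GPUS^2 * (BATCH_SIZE_GPU0 / 32) * 0.02, where 0.02 is the ratio
+  # 0.002 (MobileViT AdamW LR) / 0.1 (ResNet SGD LR). For example, the 4-GPU COCO SSD runs
+  # above with 32 images per GPU give 2.7e-3 * 16 * 1 * 0.02 ~= 8.6e-4, i.e. the 0.0009
+  # used there; scale accordingly for your own GPU count and per-GPU batch size.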
is_iteration_based: false + max_epochs: 50 + warmup_iterations: 500 + warmup_init_lr: 0.00009 + cosine: + max_lr: 0.0009 # [2.7e-3 * N_GPUS^2 x (BATCH_SIZE_GPU0/ 32) * 0.02 ] # 0.02 comes from this fact 0.1 (ResNet SGD LR)/0.002 (MIT ADAMW LR) + min_lr: 1.e-6 +model: + segmentation: + name: "encoder_decoder" + lr_multiplier: 10 + seg_head: "deeplabv3" + output_stride: 16 + classifier_dropout: 0.1 + activation: + name: "relu" + deeplabv3: + aspp_dropout: 0.1 + aspp_sep_conv: false + aspp_out_channels: 256 + aspp_rates: [6, 12, 18] + classification: + name: "mobilevit_v3" + classifier_dropout: 0.1 + mit: + mode: "xx_small_v3" + ffn_dropout: 0.0 + attn_dropout: 0.0 + dropout: 0.05 + number_heads: 4 + no_fuse_local_global_features: false + conv_kernel_size: 3 + activation: + name: "swish" + pretrained: "results/mobilevitv3_xx_e300_7098/run_1/checkpoint_ema_best.pt" + normalization: + name: "sync_batch_norm" + momentum: 0.1 + activation: + name: "relu" + inplace: false + layer: + global_pool: "mean" + conv_init: "kaiming_normal" + linear_init: "normal" + conv_weight_std: false +ema: + enable: true + momentum: 0.0005 +ddp: + enable: true + rank: 0 + world_size: -1 + dist_port: 30786 +stats: + name: [ "loss", "iou"] + checkpoint_metric: "iou" + checkpoint_metric_max: true diff --git a/MobileViTv3-v1/cvnets/models/classification/config/mobilevit.py b/MobileViTv3-v1/cvnets/models/classification/config/mobilevit.py new file mode 100644 index 0000000..d93a2ac --- /dev/null +++ b/MobileViTv3-v1/cvnets/models/classification/config/mobilevit.py @@ -0,0 +1,553 @@ +# For licensing see accompanying LICENSE file. + +from typing import Dict + +from utils import logger + + +def get_configuration(opts) -> Dict: + mode = getattr(opts, "model.classification.mit.mode", "small") + if mode is None: + logger.error("Please specify mode") + + head_dim = getattr(opts, "model.classification.mit.head_dim", None) + num_heads = getattr(opts, "model.classification.mit.number_heads", None) + if head_dim is not None: + if num_heads is not None: + logger.error( + "--model.classification.mit.head-dim and --model.classification.mit.number-heads " + "are mutually exclusive." + ) + elif num_heads is not None: + if head_dim is not None: + logger.error( + "--model.classification.mit.head-dim and --model.classification.mit.number-heads " + "are mutually exclusive." 
+ ) + mode = mode.lower() + if mode == 'xx_small': + mv2_exp_mult = 2 + config = { + "layer1": { + "out_channels": 16, + "expand_ratio": mv2_exp_mult, + "num_blocks": 1, + "stride": 1, + "block_type": "mv2" + }, + "layer2": { + "out_channels": 24, + "expand_ratio": mv2_exp_mult, + "num_blocks": 3, + "stride": 2, + "block_type": "mv2" + }, + "layer3": { # 28x28 + "out_channels": 48, + "transformer_channels": 64, + "ffn_dim": 128, + "transformer_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": head_dim, + "num_heads": num_heads, + "block_type": "mobilevit" + }, + "layer4": { # 14x14 + "out_channels": 64, + "transformer_channels": 80, + "ffn_dim": 160, + "transformer_blocks": 4, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": head_dim, + "num_heads": num_heads, + "block_type": "mobilevit" + }, + "layer5": { # 7x7 + "out_channels": 80, + "transformer_channels": 96, + "ffn_dim": 192, + "transformer_blocks": 3, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": head_dim, + "num_heads": num_heads, + "block_type": "mobilevit" + }, + "last_layer_exp_factor": 4 + } + elif mode == 'xx_small_v3': + mv2_exp_mult = 2 + config = { + "layer1": { + "out_channels": 16, + "expand_ratio": mv2_exp_mult, + "num_blocks": 1, + "stride": 1, + "block_type": "mv2" + }, + "layer2": { + "out_channels": 24, + "expand_ratio": mv2_exp_mult, + "num_blocks": 3, + "stride": 2, + "block_type": "mv2" + }, + "layer3": { # 28x28 + "out_channels": 64, + "transformer_channels": 64, + "ffn_dim": 128, + "transformer_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": head_dim, + "num_heads": num_heads, + "block_type": "mobilevit" + }, + "layer4": { # 14x14 + "out_channels": 80, + "transformer_channels": 80, + "ffn_dim": 160, + "transformer_blocks": 4, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": head_dim, + "num_heads": num_heads, + "block_type": "mobilevit" + }, + "layer5": { # 7x7 + "out_channels": 128, + "transformer_channels": 96, + "ffn_dim": 192, + "transformer_blocks": 3, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": head_dim, + "num_heads": num_heads, + "block_type": "mobilevit" + }, + "last_layer_exp_factor": 4 + } + elif mode == 'xx_small_v3_fast': + mv2_exp_mult = 2 + config = { + "layer1": { + "out_channels": 16, + "expand_ratio": mv2_exp_mult, + "num_blocks": 1, + "stride": 1, + "block_type": "mv2" + }, + "layer2": { + "out_channels": 24, + "expand_ratio": mv2_exp_mult, + "num_blocks": 3, + "stride": 2, + "block_type": "mv2" + }, + "layer3": { # 28x28 + "out_channels": 64, + "transformer_channels": 64, + "ffn_dim": 128, + "transformer_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": head_dim, + "num_heads": num_heads, + "block_type": "mobilevit" + }, + "layer4": { # 14x14 + "out_channels": 80, + "transformer_channels": 80, + "ffn_dim": 160, + "transformer_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": head_dim, + "num_heads": num_heads, + "block_type": "mobilevit" + }, + "layer5": { # 7x7 + "out_channels": 128, + "transformer_channels": 96, + "ffn_dim": 192, + "transformer_blocks": 3, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": head_dim, + 
"num_heads": num_heads, + "block_type": "mobilevit" + }, + "last_layer_exp_factor": 4 + } + elif mode == 'x_small': + mv2_exp_mult = 4 + config = { + "layer1": { + "out_channels": 32, + "expand_ratio": mv2_exp_mult, + "num_blocks": 1, + "stride": 1, + "block_type": "mv2" + }, + "layer2": { + "out_channels": 48, + "expand_ratio": mv2_exp_mult, + "num_blocks": 3, + "stride": 2, + "block_type": "mv2" + }, + "layer3": { # 28x28 + "out_channels": 64, + "transformer_channels": 96, + "ffn_dim": 192, + "transformer_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": head_dim, + "num_heads": num_heads, + "block_type": "mobilevit" + }, + "layer4": { # 14x14 + "out_channels": 80, + "transformer_channels": 120, + "ffn_dim": 240, + "transformer_blocks": 4, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": head_dim, + "num_heads": num_heads, + "block_type": "mobilevit" + }, + "layer5": { # 7x7 + "out_channels": 96, + "transformer_channels": 144, + "ffn_dim": 288, + "transformer_blocks": 3, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": head_dim, + "num_heads": num_heads, + "block_type": "mobilevit" + }, + "last_layer_exp_factor": 4 + } + elif mode == 'x_small_v3': + mv2_exp_mult = 4 + config = { + "layer1": { + "out_channels": 32, + "expand_ratio": mv2_exp_mult, + "num_blocks": 1, + "stride": 1, + "block_type": "mv2" + }, + "layer2": { + "out_channels": 48, + "expand_ratio": mv2_exp_mult, + "num_blocks": 3, + "stride": 2, + "block_type": "mv2" + }, + "layer3": { # 28x28 + "out_channels": 96, + "transformer_channels": 96, + "ffn_dim": 192, + "transformer_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": head_dim, + "num_heads": num_heads, + "block_type": "mobilevit" + }, + "layer4": { # 14x14 + "out_channels": 160, + "transformer_channels": 120, + "ffn_dim": 240, + "transformer_blocks": 4, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": head_dim, + "num_heads": num_heads, + "block_type": "mobilevit" + }, + "layer5": { # 7x7 + "out_channels": 160, + "transformer_channels": 144, + "ffn_dim": 288, + "transformer_blocks": 3, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": head_dim, + "num_heads": num_heads, + "block_type": "mobilevit" + }, + "last_layer_exp_factor": 4 + } + elif mode == 'x_small_v3_fast': + mv2_exp_mult = 4 + config = { + "layer1": { + "out_channels": 32, + "expand_ratio": mv2_exp_mult, + "num_blocks": 1, + "stride": 1, + "block_type": "mv2" + }, + "layer2": { + "out_channels": 48, + "expand_ratio": mv2_exp_mult, + "num_blocks": 3, + "stride": 2, + "block_type": "mv2" + }, + "layer3": { # 28x28 + "out_channels": 96, + "transformer_channels": 96, + "ffn_dim": 192, + "transformer_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": head_dim, + "num_heads": num_heads, + "block_type": "mobilevit" + }, + "layer4": { # 14x14 + "out_channels": 160, + "transformer_channels": 120, + "ffn_dim": 240, + "transformer_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": head_dim, + "num_heads": num_heads, + "block_type": "mobilevit" + }, + "layer5": { # 7x7 + "out_channels": 160, + "transformer_channels": 144, + "ffn_dim": 288, + "transformer_blocks": 3, + "patch_h": 2, + "patch_w": 2, + "stride": 
2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": head_dim, + "num_heads": num_heads, + "block_type": "mobilevit" + }, + "last_layer_exp_factor": 4 + } + elif mode == "small": + mv2_exp_mult = 4 + config = { + "layer1": { + "out_channels": 32, + "expand_ratio": mv2_exp_mult, + "num_blocks": 1, + "stride": 1, + "block_type": "mv2" + }, + "layer2": { + "out_channels": 64, + "expand_ratio": mv2_exp_mult, + "num_blocks": 3, + "stride": 2, + "block_type": "mv2" + }, + "layer3": { # 28x28 + "out_channels": 96, + "transformer_channels": 144, + "ffn_dim": 288, + "transformer_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": head_dim, + "num_heads": num_heads, + "block_type": "mobilevit" + }, + "layer4": { # 14x14 + "out_channels": 128, + "transformer_channels": 192, + "ffn_dim": 384, + "transformer_blocks": 4, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": head_dim, + "num_heads": num_heads, + "block_type": "mobilevit" + }, + "layer5": { # 7x7 + "out_channels": 160, + "transformer_channels": 240, + "ffn_dim": 480, + "transformer_blocks": 3, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": head_dim, + "num_heads": num_heads, + "block_type": "mobilevit" + }, + "last_layer_exp_factor": 4 + } + elif mode == "small_v3": + mv2_exp_mult = 4 + config = { + "layer1": { + "out_channels": 32, + "expand_ratio": mv2_exp_mult, + "num_blocks": 1, + "stride": 1, + "block_type": "mv2" + }, + "layer2": { + "out_channels": 64, + "expand_ratio": mv2_exp_mult, + "num_blocks": 3, + "stride": 2, + "block_type": "mv2" + }, + "layer3": { # 28x28 + "out_channels": 128, + "transformer_channels": 144, + "ffn_dim": 288, + "transformer_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": head_dim, + "num_heads": num_heads, + "block_type": "mobilevit" + }, + "layer4": { # 14x14 + "out_channels": 256, + "transformer_channels": 192, + "ffn_dim": 384, + "transformer_blocks": 4, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": head_dim, + "num_heads": num_heads, + "block_type": "mobilevit" + }, + "layer5": { # 7x7 + "out_channels": 320, + "transformer_channels": 240, + "ffn_dim": 480, + "transformer_blocks": 3, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": head_dim, + "num_heads": num_heads, + "block_type": "mobilevit" + }, + "last_layer_exp_factor": 4 + } + elif mode == "small_v3_fast": + mv2_exp_mult = 4 + config = { + "layer1": { + "out_channels": 32, + "expand_ratio": mv2_exp_mult, + "num_blocks": 1, + "stride": 1, + "block_type": "mv2" + }, + "layer2": { + "out_channels": 64, + "expand_ratio": mv2_exp_mult, + "num_blocks": 3, + "stride": 2, + "block_type": "mv2" + }, + "layer3": { # 28x28 + "out_channels": 128, + "transformer_channels": 144, + "ffn_dim": 288, + "transformer_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": head_dim, + "num_heads": num_heads, + "block_type": "mobilevit" + }, + "layer4": { # 14x14 + "out_channels": 256, + "transformer_channels": 192, + "ffn_dim": 384, + "transformer_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": head_dim, + "num_heads": num_heads, + "block_type": "mobilevit" + }, + "layer5": { # 7x7 + "out_channels": 320, + "transformer_channels": 240, + "ffn_dim": 480, + 
"transformer_blocks": 3, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": head_dim, + "num_heads": num_heads, + "block_type": "mobilevit" + }, + "last_layer_exp_factor": 4 + } + else: + raise NotImplementedError + + return config diff --git a/MobileViTv3-v1/cvnets/models/classification/mobilevit.py b/MobileViTv3-v1/cvnets/models/classification/mobilevit.py new file mode 100644 index 0000000..f7680b3 --- /dev/null +++ b/MobileViTv3-v1/cvnets/models/classification/mobilevit.py @@ -0,0 +1,223 @@ +# +# For licensing see accompanying LICENSE file. +# + +from torch import nn +import argparse +from typing import Dict, Tuple, Optional + +from utils import logger + +from . import register_cls_models +from .base_cls import BaseEncoder +from .config.mobilevit import get_configuration +from ...layers import ConvLayer, LinearLayer, GlobalPool, Dropout, SeparableConv +from ...modules import InvertedResidual, MobileViTv3Block + + +@register_cls_models("mobilevit_v3") +class MobileViTv3(BaseEncoder): + """ + MobileViTv3: + """ + def __init__(self, opts, *args, **kwargs) -> None: + num_classes = getattr(opts, "model.classification.n_classes", 1000) + classifier_dropout = getattr(opts, "model.classification.classifier_dropout", 0.2) + + pool_type = getattr(opts, "model.layer.global_pool", "mean") + image_channels = 3 + out_channels = 16 + + mobilevit_config = get_configuration(opts=opts) + + # Segmentation architectures like Deeplab and PSPNet modifies the strides of the classification backbones + # We allow that using `output_stride` arguments + output_stride = kwargs.get("output_stride", None) + dilate_l4 = dilate_l5 = False + if output_stride == 8: + dilate_l4 = True + dilate_l5 = True + elif output_stride == 16: + dilate_l5 = True + + super(MobileViTv3, self).__init__() + self.dilation = 1 + + # store model configuration in a dictionary + self.model_conf_dict = dict() + self.conv_1 = ConvLayer( + opts=opts, in_channels=image_channels, out_channels=out_channels, + kernel_size=3, stride=2, use_norm=True, use_act=True + ) + + self.model_conf_dict['conv1'] = {'in': image_channels, 'out': out_channels} + + in_channels = out_channels + self.layer_1, out_channels = self._make_layer( + opts=opts, input_channel=in_channels, cfg=mobilevit_config["layer1"] + ) + self.model_conf_dict['layer1'] = {'in': in_channels, 'out': out_channels} + + in_channels = out_channels + self.layer_2, out_channels = self._make_layer( + opts=opts, input_channel=in_channels, cfg=mobilevit_config["layer2"] + ) + self.model_conf_dict['layer2'] = {'in': in_channels, 'out': out_channels} + + in_channels = out_channels + self.layer_3, out_channels = self._make_layer( + opts=opts, input_channel=in_channels, cfg=mobilevit_config["layer3"] + ) + self.model_conf_dict['layer3'] = {'in': in_channels, 'out': out_channels} + + in_channels = out_channels + self.layer_4, out_channels = self._make_layer( + opts=opts, input_channel=in_channels, cfg=mobilevit_config["layer4"], dilate=dilate_l4 + ) + self.model_conf_dict['layer4'] = {'in': in_channels, 'out': out_channels} + + in_channels = out_channels + self.layer_5, out_channels = self._make_layer( + opts=opts, input_channel=in_channels, cfg=mobilevit_config["layer5"], dilate=dilate_l5 + ) + self.model_conf_dict['layer5'] = {'in': in_channels, 'out': out_channels} + + in_channels = out_channels + exp_channels = min(mobilevit_config["last_layer_exp_factor"] * in_channels, 960) + self.conv_1x1_exp = ConvLayer( + opts=opts, in_channels=in_channels, 
out_channels=exp_channels, + kernel_size=1, stride=1, use_act=True, use_norm=True + ) + + self.model_conf_dict['exp_before_cls'] = {'in': in_channels, 'out': exp_channels} + + self.classifier = nn.Sequential() + self.classifier.add_module(name="global_pool", module=GlobalPool(pool_type=pool_type, keep_dim=False)) + if 0.0 < classifier_dropout < 1.0: + self.classifier.add_module(name="dropout", module=Dropout(p=classifier_dropout, inplace=True)) + self.classifier.add_module( + name="fc", + module=LinearLayer(in_features=exp_channels, out_features=num_classes, bias=True) + ) + + # check model + self.check_model() + + # weight initialization + self.reset_parameters(opts=opts) + + @classmethod + def add_arguments(cls, parser: argparse.ArgumentParser): + group = parser.add_argument_group(title="".format(cls.__name__), description="".format(cls.__name__)) + group.add_argument('--model.classification.mit.mode', type=str, default=None, + choices=['xx_small', 'x_small', 'small'], help="MIT mode") + group.add_argument('--model.classification.mit.attn-dropout', type=float, default=0.1, + help="Dropout in attention layer") + group.add_argument('--model.classification.mit.ffn-dropout', type=float, default=0.0, + help="Dropout between FFN layers") + group.add_argument('--model.classification.mit.dropout', type=float, default=0.1, + help="Dropout in Transformer layer") + group.add_argument('--model.classification.mit.transformer-norm-layer', type=str, default="layer_norm", + help="Normalization layer in transformer") + group.add_argument('--model.classification.mit.no-fuse-local-global-features', action="store_true", + help="Do not combine local and global features in MIT block") + group.add_argument('--model.classification.mit.conv-kernel-size', type=int, default=3, + help="Kernel size of Conv layers in MIT block") + + group.add_argument('--model.classification.mit.head-dim', type=int, default=None, + help="Head dimension in transformer") + group.add_argument('--model.classification.mit.number-heads', type=int, default=None, + help="No. 
of heads in transformer") + return parser + + def _make_layer(self, opts, input_channel, cfg: Dict, dilate: Optional[bool] = False) -> Tuple[nn.Sequential, int]: + block_type = cfg.get("block_type", "mobilevit") + if block_type.lower() == "mobilevit": + return self._make_mit_layer( + opts=opts, + input_channel=input_channel, + cfg=cfg, + dilate=dilate + ) + else: + return self._make_mobilenet_layer( + opts=opts, + input_channel=input_channel, + cfg=cfg + ) + + @staticmethod + def _make_mobilenet_layer(opts, input_channel: int, cfg: Dict) -> Tuple[nn.Sequential, int]: + output_channels = cfg.get("out_channels") + num_blocks = cfg.get("num_blocks", 2) + expand_ratio = cfg.get("expand_ratio", 4) + block = [] + + for i in range(num_blocks): + stride = cfg.get("stride", 1) if i == 0 else 1 + + layer = InvertedResidual( + opts=opts, + in_channels=input_channel, + out_channels=output_channels, + stride=stride, + expand_ratio=expand_ratio + ) + block.append(layer) + input_channel = output_channels + return nn.Sequential(*block), input_channel + + def _make_mit_layer(self, opts, input_channel, cfg: Dict, dilate: Optional[bool] = False) -> Tuple[nn.Sequential, int]: + prev_dilation = self.dilation + block = [] + stride = cfg.get("stride", 1) + + if stride == 2: + if dilate: + self.dilation *= 2 + stride = 1 + + layer = InvertedResidual( + opts=opts, + in_channels=input_channel, + out_channels=cfg.get("out_channels"), + stride=stride, + expand_ratio=cfg.get("mv_expand_ratio", 4), + dilation=prev_dilation + ) + + block.append(layer) + input_channel = cfg.get("out_channels") + + head_dim = cfg.get("head_dim", 32) + transformer_dim = cfg["transformer_channels"] + ffn_dim = cfg.get("ffn_dim") + if head_dim is None: + num_heads = cfg.get("num_heads", 4) + if num_heads is None: + num_heads = 4 + head_dim = transformer_dim // num_heads + + if transformer_dim % head_dim != 0: + logger.error("Transformer input dimension should be divisible by head dimension. " + "Got {} and {}.".format(transformer_dim, head_dim)) + + block.append( + MobileViTv3Block( + opts=opts, + in_channels=input_channel, + transformer_dim=transformer_dim, + ffn_dim=ffn_dim, + n_transformer_blocks=cfg.get("transformer_blocks", 1), + patch_h=cfg.get("patch_h", 2), + patch_w=cfg.get("patch_w", 2), + dropout=getattr(opts, "model.classification.mit.dropout", 0.1), + ffn_dropout=getattr(opts, "model.classification.mit.ffn_dropout", 0.0), + attn_dropout=getattr(opts, "model.classification.mit.attn_dropout", 0.1), + head_dim=head_dim, + no_fusion=getattr(opts, "model.classification.mit.no_fuse_local_global_features", False), + conv_ksize=getattr(opts, "model.classification.mit.conv_kernel_size", 3) + ) + ) + + return nn.Sequential(*block), input_channel diff --git a/MobileViTv3-v1/cvnets/modules/__init__.py b/MobileViTv3-v1/cvnets/modules/__init__.py new file mode 100644 index 0000000..8303815 --- /dev/null +++ b/MobileViTv3-v1/cvnets/modules/__init__.py @@ -0,0 +1,29 @@ +# +# For licensing see accompanying LICENSE file. 
+# + +from .base_module import BaseModule +from .squeeze_excitation import SqueezeExcitation +from .mobilenetv2 import InvertedResidual, InvertedResidualSE +from .resnet import BasicResNetBlock, BottleneckResNetBlock +from .aspp_block import ASPP +from .transformer import TransformerEncoder +from .ppm import PPM +from .mobilevit_block import MobileViTv3Block +from .feature_pyramid import FPModule +from .ssd import SSDHead + + +__all__ = [ + 'InvertedResidual', + 'InvertedResidualSE', + 'BasicResNetBlock', + 'BottleneckResNetBlock', + 'ASPP', + 'TransformerEncoder', + 'SqueezeExcitation', + 'PPM', + 'MobileViTv3Block', + 'FPModule', + 'SSDHead' +] diff --git a/MobileViTv3-v1/cvnets/modules/mobilevit_block.py b/MobileViTv3-v1/cvnets/modules/mobilevit_block.py new file mode 100644 index 0000000..0b7d698 --- /dev/null +++ b/MobileViTv3-v1/cvnets/modules/mobilevit_block.py @@ -0,0 +1,238 @@ +# For licensing see accompanying LICENSE file. + +import numpy as np +from torch import nn, Tensor +import math +import torch +from torch.nn import functional as F +from typing import Optional, Dict, Tuple + +from .transformer import TransformerEncoder +from .base_module import BaseModule +from ..misc.profiler import module_profile +from ..layers import ConvLayer, get_normalization_layer + + +class MobileViTv3Block(BaseModule): + """ + MobileViTv3 block + """ + def __init__(self, opts, in_channels: int, transformer_dim: int, ffn_dim: int, + n_transformer_blocks: Optional[int] = 2, + head_dim: Optional[int] = 32, attn_dropout: Optional[float] = 0.1, + dropout: Optional[int] = 0.1, ffn_dropout: Optional[int] = 0.1, patch_h: Optional[int] = 8, + patch_w: Optional[int] = 8, transformer_norm_layer: Optional[str] = "layer_norm", + conv_ksize: Optional[int] = 3, + dilation: Optional[int] = 1, var_ffn: Optional[bool] = False, + no_fusion: Optional[bool] = False, + *args, **kwargs): + + # For MobileViTv3: Normal 3x3 convolution --> Depthwise 3x3 convolution + conv_3x3_in = ConvLayer( + opts=opts, in_channels=in_channels, out_channels=in_channels, + kernel_size=conv_ksize, stride=1, use_norm=True, use_act=True, dilation=dilation, + groups=in_channels + ) + conv_1x1_in = ConvLayer( + opts=opts, in_channels=in_channels, out_channels=transformer_dim, + kernel_size=1, stride=1, use_norm=False, use_act=False + ) + + + conv_1x1_out = ConvLayer( + opts=opts, in_channels=transformer_dim, out_channels=in_channels, + kernel_size=1, stride=1, use_norm=True, use_act=True + ) + conv_3x3_out = None + + # For MobileViTv3: input+global --> local+global + if not no_fusion: + #input_ch = tr_dim + in_ch + conv_3x3_out = ConvLayer( + opts=opts, in_channels= transformer_dim + in_channels, out_channels=in_channels, + kernel_size=1, stride=1, use_norm=True, use_act=True + ) + + super(MobileViTv3Block, self).__init__() + self.local_rep = nn.Sequential() + self.local_rep.add_module(name="conv_3x3", module=conv_3x3_in) + self.local_rep.add_module(name="conv_1x1", module=conv_1x1_in) + + assert transformer_dim % head_dim == 0 + num_heads = transformer_dim // head_dim + + ffn_dims = [ffn_dim] * n_transformer_blocks + + global_rep = [ + TransformerEncoder(opts=opts, embed_dim=transformer_dim, ffn_latent_dim=ffn_dims[block_idx], num_heads=num_heads, + attn_dropout=attn_dropout, dropout=dropout, ffn_dropout=ffn_dropout, + transformer_norm_layer=transformer_norm_layer) + for block_idx in range(n_transformer_blocks) + ] + global_rep.append( + get_normalization_layer(opts=opts, norm_type=transformer_norm_layer, num_features=transformer_dim) + ) + 
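+        # global_rep stacks n_transformer_blocks TransformerEncoder layers followed by a final
+        # normalization layer; it runs on the unfolded patch sequence of shape
+        # [B * patch_area, num_patches, transformer_dim] produced by unfolding() below.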
self.global_rep = nn.Sequential(*global_rep) + + self.conv_proj = conv_1x1_out + + self.fusion = conv_3x3_out + + self.patch_h = patch_h + self.patch_w = patch_w + self.patch_area = self.patch_w * self.patch_h + + self.cnn_in_dim = in_channels + self.cnn_out_dim = transformer_dim + self.n_heads = num_heads + self.ffn_dim = ffn_dim + self.dropout = dropout + self.attn_dropout = attn_dropout + self.ffn_dropout = ffn_dropout + self.dilation = dilation + self.ffn_max_dim = ffn_dims[0] + self.ffn_min_dim = ffn_dims[-1] + self.var_ffn = var_ffn + self.n_blocks = n_transformer_blocks + self.conv_ksize = conv_ksize + + def __repr__(self): + repr_str = "{}(".format(self.__class__.__name__) + repr_str += "\n\tconv_in_dim={}, conv_out_dim={}, dilation={}, conv_ksize={}".format(self.cnn_in_dim, self.cnn_out_dim, self.dilation, self.conv_ksize) + repr_str += "\n\tpatch_h={}, patch_w={}".format(self.patch_h, self.patch_w) + repr_str += "\n\ttransformer_in_dim={}, transformer_n_heads={}, transformer_ffn_dim={}, dropout={}, " \ + "ffn_dropout={}, attn_dropout={}, blocks={}".format( + self.cnn_out_dim, + self.n_heads, + self.ffn_dim, + self.dropout, + self.ffn_dropout, + self.attn_dropout, + self.n_blocks + ) + if self.var_ffn: + repr_str += "\n\t var_ffn_min_mult={}, var_ffn_max_mult={}".format( + self.ffn_min_dim, self.ffn_max_dim + ) + + repr_str += "\n)" + return repr_str + + def unfolding(self, feature_map: Tensor) -> Tuple[Tensor, Dict]: + patch_w, patch_h = self.patch_w, self.patch_h + patch_area = int(patch_w * patch_h) + batch_size, in_channels, orig_h, orig_w = feature_map.shape + + new_h = int(math.ceil(orig_h / self.patch_h) * self.patch_h) + new_w = int(math.ceil(orig_w / self.patch_w) * self.patch_w) + + interpolate = False + if new_w != orig_w or new_h != orig_h: + # Note: Padding can be done, but then it needs to be handled in attention function. + feature_map = F.interpolate(feature_map, size=(new_h, new_w), mode="bilinear", align_corners=False) + interpolate = True + + # number of patches along width and height + num_patch_w = new_w // patch_w # n_w + num_patch_h = new_h // patch_h # n_h + num_patches = num_patch_h * num_patch_w # N + + # [B, C, H, W] --> [B * C * n_h, p_h, n_w, p_w] + reshaped_fm = feature_map.reshape(batch_size * in_channels * num_patch_h, patch_h, num_patch_w, patch_w) + # [B * C * n_h, p_h, n_w, p_w] --> [B * C * n_h, n_w, p_h, p_w] + transposed_fm = reshaped_fm.transpose(1, 2) + # [B * C * n_h, n_w, p_h, p_w] --> [B, C, N, P] where P = p_h * p_w and N = n_h * n_w + reshaped_fm = transposed_fm.reshape(batch_size, in_channels, num_patches, patch_area) + # [B, C, N, P] --> [B, P, N, C] + transposed_fm = reshaped_fm.transpose(1, 3) + # [B, P, N, C] --> [BP, N, C] + patches = transposed_fm.reshape(batch_size * patch_area, num_patches, -1) + + info_dict = { + "orig_size": (orig_h, orig_w), + "batch_size": batch_size, + "interpolate": interpolate, + "total_patches": num_patches, + "num_patches_w": num_patch_w, + "num_patches_h": num_patch_h + } + + return patches, info_dict + + def folding(self, patches: Tensor, info_dict: Dict) -> Tensor: + n_dim = patches.dim() + assert n_dim == 3, "Tensor should be of shape BPxNxC. 
Got: {}".format(patches.shape) + # [BP, N, C] --> [B, P, N, C] + patches = patches.contiguous().view(info_dict["batch_size"], self.patch_area, info_dict["total_patches"], -1) + + batch_size, pixels, num_patches, channels = patches.size() + num_patch_h = info_dict["num_patches_h"] + num_patch_w = info_dict["num_patches_w"] + + # [B, P, N, C] --> [B, C, N, P] + patches = patches.transpose(1, 3) + + # [B, C, N, P] --> [B*C*n_h, n_w, p_h, p_w] + feature_map = patches.reshape(batch_size * channels * num_patch_h, num_patch_w, self.patch_h, self.patch_w) + # [B*C*n_h, n_w, p_h, p_w] --> [B*C*n_h, p_h, n_w, p_w] + feature_map = feature_map.transpose(1, 2) + # [B*C*n_h, p_h, n_w, p_w] --> [B, C, H, W] + feature_map = feature_map.reshape(batch_size, channels, num_patch_h * self.patch_h, num_patch_w * self.patch_w) + if info_dict["interpolate"]: + feature_map = F.interpolate(feature_map, size=info_dict["orig_size"], mode="bilinear", align_corners=False) + return feature_map + + def forward(self, x: Tensor) -> Tensor: + res = x + + # For MobileViTv3: Normal 3x3 convolution --> Depthwise 3x3 convolution + fm_conv = self.local_rep(x) + + # convert feature map to patches + patches, info_dict = self.unfolding(fm_conv) + + # learn global representations + patches = self.global_rep(patches) + + # [B x Patch x Patches x C] --> [B x C x Patches x Patch] + fm = self.folding(patches=patches, info_dict=info_dict) + + fm = self.conv_proj(fm) + + if self.fusion is not None: + # For MobileViTv3: input+global --> local+global + fm = self.fusion( + torch.cat((fm_conv, fm), dim=1) + ) + + # For MobileViTv3: Skip connection + fm = fm + res + + return fm + + def profile_module(self, input: Tensor) -> (Tensor, float, float): + params = macs = 0.0 + + res = input + out_conv, p, m = module_profile(module=self.local_rep, x=input) + params += p + macs += m + + patches, info_dict = self.unfolding(feature_map=out_conv) + + patches, p, m = module_profile(module=self.global_rep, x=patches) + params += p + macs += m + + fm = self.folding(patches=patches, info_dict=info_dict) + + out, p, m = module_profile(module=self.conv_proj, x=fm) + params += p + macs += m + + if self.fusion is not None: + out, p, m = module_profile(module=self.fusion, x=torch.cat((out, out_conv), dim=1)) + params += p + macs += m + + return res, params, macs diff --git a/MobileViTv3-v1/environment_cvnet.yml b/MobileViTv3-v1/environment_cvnet.yml new file mode 100644 index 0000000..3abd470 --- /dev/null +++ b/MobileViTv3-v1/environment_cvnet.yml @@ -0,0 +1,153 @@ +name: cvnet +channels: + - pytorch + - anaconda + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=1_gnu + - blas=1.0=mkl + - blosc=1.21.0=h8c45485_0 + - brotli=1.0.9=he6710b0_2 + - brotlipy=0.7.0=py39h27cfd23_1003 + - brunsli=0.1=h2531618_0 + - bzip2=1.0.8=h7b6447c_0 + - ca-certificates=2021.10.8=ha878542_0 + - certifi=2021.10.8=py39hf3d152e_2 + - cffi=1.15.0=py39hd667e15_1 + - cfitsio=3.470=hf0d0db6_6 + - charls=2.2.0=h2531618_0 + - charset-normalizer=2.0.4=pyhd3eb1b0_0 + - cloudpickle=1.6.0=py_0 + - cryptography=36.0.0=py39h9ce1e76_0 + - cudatoolkit=11.3.1=h2bc3f7f_2 + - cycler=0.11.0=pyhd8ed1ab_0 + - cython=0.29.28=py39h295c915_0 + - cytoolz=0.11.0=py39h27cfd23_0 + - dask-core=2.30.0=py_0 + - decorator=4.4.2=py_0 + - ffmpeg=4.3=hf484d3e_0 + - freetype=2.11.0=h70c0345_0 + - giflib=5.2.1=h7b6447c_0 + - gmp=6.2.1=h2531618_2 + - gnutls=3.6.15=he1e5248_0 + - idna=3.3=pyhd3eb1b0_0 + - imagecodecs=2021.8.26=py39h4cda21f_0 + - imageio=2.9.0=py_0 + - 
intel-openmp=2021.4.0=h06a4308_3561 + - joblib=0.17.0=py_0 + - jpeg=9d=h7f8727e_0 + - jxrlib=1.1=h7b6447c_2 + - kiwisolver=1.3.2=py39h295c915_0 + - krb5=1.18.2=h173b8e3_0 + - lame=3.100=h7b6447c_0 + - lcms2=2.12=h3be6417_0 + - ld_impl_linux-64=2.35.1=h7274673_9 + - lerc=3.0=h295c915_0 + - libaec=1.0.4=he6710b0_1 + - libcurl=7.71.1=h20c2e04_1 + - libdeflate=1.8=h7f8727e_5 + - libedit=3.1.20191231=h14c3975_1 + - libffi=3.3=he6710b0_2 + - libgcc-ng=11.2.0=h1d223b6_14 + - libgfortran-ng=7.5.0=ha8ba4b0_17 + - libgfortran4=7.5.0=ha8ba4b0_17 + - libgomp=11.2.0=h1d223b6_14 + - libiconv=1.15=h63c8f33_5 + - libidn2=2.3.2=h7f8727e_0 + - libpng=1.6.37=hbc83047_0 + - libssh2=1.9.0=h1ba5d50_1 + - libstdcxx-ng=9.3.0=hd4cf53a_17 + - libtasn1=4.16.0=h27cfd23_0 + - libtiff=4.2.0=h85742a9_0 + - libunistring=0.9.10=h27cfd23_0 + - libuv=1.40.0=h7b6447c_0 + - libwebp=1.2.2=h55f646e_0 + - libwebp-base=1.2.2=h7f8727e_0 + - libzopfli=1.0.3=he6710b0_0 + - lz4-c=1.9.3=h295c915_1 + - matplotlib-base=3.3.4=py39h2fa2bec_0 + - mkl=2021.4.0=h06a4308_640 + - mkl-service=2.4.0=py39h7f8727e_0 + - mkl_fft=1.3.1=py39hd3c417c_0 + - mkl_random=1.2.2=py39h51133e4_0 + - ncurses=6.3=h7f8727e_2 + - nettle=3.7.3=hbbd107a_1 + - networkx=2.5=py_0 + - numpy=1.21.2=py39h20f2e39_0 + - numpy-base=1.21.2=py39h79a1101_0 + - openh264=2.1.1=h4ff587b_0 + - openjpeg=2.3.0=h05c96fa_1 + - openssl=1.1.1n=h166bdaf_0 + - packaging=20.4=py_0 + - pillow=9.0.1=py39h22f2fdc_0 + - pip=21.2.4=py39h06a4308_0 + - pycocotools=2.0.4=py39hce5d2b2_0 + - pycparser=2.21=pyhd3eb1b0_0 + - pyopenssl=22.0.0=pyhd3eb1b0_0 + - pyparsing=2.4.7=py_0 + - pysocks=1.7.1=py39h06a4308_0 + - python=3.9.12=h12debd9_0 + - python-dateutil=2.8.2=pyhd8ed1ab_0 + - python_abi=3.9=2_cp39 + - pytorch=1.11.0=py3.9_cuda11.3_cudnn8.2.0_0 + - pytorch-mutex=1.0=cuda + - pywavelets=1.3.0=py39h7f8727e_0 + - pyyaml=6.0=py39h7f8727e_1 + - readline=8.1.2=h7f8727e_1 + - requests=2.27.1=pyhd3eb1b0_0 + - scikit-image=0.19.2=py39h51133e4_0 + - scikit-learn=1.0.2=py39h51133e4_1 + - scipy=1.7.3=py39hc147768_0 + - setuptools=58.0.4=py39h06a4308_0 + - six=1.16.0=pyhd3eb1b0_1 + - snappy=1.1.8=he6710b0_0 + - sqlite=3.38.2=hc218d9a_0 + - threadpoolctl=2.1.0=pyh5ca1d4c_0 + - tifffile=2020.10.1=py_0 + - tk=8.6.11=h1ccaba5_0 + - toolz=0.11.1=py_0 + - torchvision=0.12.0=py39_cu113 + - tornado=6.1=py39hb9d737c_3 + - typing_extensions=4.1.1=pyh06a4308_0 + - tzdata=2022a=hda174b7_0 + - ujson=5.1.0=py39h295c915_0 + - urllib3=1.26.8=pyhd3eb1b0_0 + - wheel=0.37.1=pyhd3eb1b0_0 + - xz=5.2.5=h7b6447c_0 + - yaml=0.2.5=h7b6447c_0 + - zfp=0.5.5=h295c915_6 + - zlib=1.2.11=h7f8727e_4 + - zstd=1.4.9=haebb681_0 + - pip: + - absl-py==1.0.0 + - appdirs==1.4.4 + - cachetools==5.0.0 + - cityscapesscripts==2.2.0 + - coloredlogs==15.0.1 + - coremltools==5.2.0 + - google-auth==2.6.6 + - google-auth-oauthlib==0.4.6 + - grpcio==1.44.0 + - humanfriendly==10.0 + - importlib-metadata==4.11.3 + - markdown==3.3.6 + - mpmath==1.2.1 + - oauthlib==3.2.0 + - opencv-contrib-python==4.5.5.64 + - protobuf==3.20.0 + - pyasn1==0.4.8 + - pyasn1-modules==0.2.8 + - pyquaternion==0.9.9 + - requests-oauthlib==1.3.1 + - rsa==4.8 + - sympy==1.10.1 + - tensorboard==2.8.0 + - tensorboard-data-server==0.6.1 + - tensorboard-plugin-wit==1.8.1 + - tqdm==4.64.0 + - typing==3.7.4.3 + - werkzeug==2.1.1 + - zipp==3.8.0 +prefix: ~/anaconda3/envs/cvnet diff --git a/MobileViTv3-v1/main_latency.py b/MobileViTv3-v1/main_latency.py new file mode 100644 index 0000000..172f45e --- /dev/null +++ b/MobileViTv3-v1/main_latency.py @@ -0,0 +1,183 @@ +# For licensing see 
accompanying LICENSE file. + +import multiprocessing +import torch +from utils import logger +from options.opts import get_training_arguments +from utils.common_utils import device_setup, create_directories +from utils.ddp_utils import is_master, distributed_init +from cvnets import get_model, EMA +from loss_fn import build_loss_fn +from optim import build_optimizer +from optim.scheduler import build_scheduler +from data import create_train_val_loader +from utils.checkpoint_utils import load_checkpoint, load_model_state +from engine import Trainer +import math +from torch.cuda.amp import GradScaler +from common import DEFAULT_EPOCHS, DEFAULT_ITERATIONS, DEFAULT_MAX_ITERATIONS, DEFAULT_MAX_EPOCHS + +#import torch.utils.benchmark as benchmark +import numpy as np + +@torch.no_grad() +def run_inference(model, input_tensor): + return model(input_tensor) + +def main(opts, **kwargs): + num_gpus = getattr(opts, "dev.num_gpus", 0) # defaults are for CPU + dev_id = getattr(opts, "dev.device_id", torch.device('cpu')) + device = getattr(opts, "dev.device", torch.device('cpu')) + + is_master_node = is_master(opts) + + # set-up data loaders + train_loader, val_loader, train_sampler = create_train_val_loader(opts) + + # set-up the model + model = get_model(opts) + + model = model.to(device=device) + model.eval() + + input_tensor = torch.randn(1,3,256,256, dtype=torch.float).to(device) + + batch_size = 100 + input_tensor_t = torch.randn(batch_size,3,256,256, dtype=torch.float).to(device) + + # reference: https://deci.ai/blog/measure-inference-time-deep-neural-networks + + # initialize + starter = torch.cuda.Event(enable_timing=True) + ender = torch.cuda.Event(enable_timing=True) + repetitions = 10000 + timings = np.zeros((repetitions,1)) + total_time = 0 + + # GPU warm-up + for _ in range(10): + _ = model(input_tensor) + + # Latency + # Measure performance + with torch.no_grad(): + for rep in range(repetitions): + starter.record() + _ = model(input_tensor) + ender.record() + # wait for gpu sync + torch.cuda.synchronize() + curr_time = starter.elapsed_time(ender) + timings[rep] = curr_time + + mean_syn = np.sum(timings) / repetitions + std_syn = np.std(timings) + print(f"Mean Latency: {mean_syn}") + + # Throughput + # Measure performance + repetitions = 1000 + with torch.no_grad(): + for rep in range(repetitions): + starter.record() + _ = model(input_tensor_t) + ender.record() + # wait for gpu sync + torch.cuda.synchronize() + curr_time = starter.elapsed_time(ender) + total_time += curr_time / 1000 + + throughput = repetitions * batch_size / total_time + print(f"Throughput: {throughput}") + + + + +def distributed_worker(i, main, opts, kwargs): + setattr(opts, "dev.device_id", i) + if torch.cuda.is_available(): + torch.cuda.set_device(i) + + ddp_rank = getattr(opts, "ddp.rank", None) + if ddp_rank is None: # torch.multiprocessing.spawn + ddp_rank = kwargs.get('start_rank', 0) + i + setattr(opts, "ddp.rank", ddp_rank) + + node_rank = distributed_init(opts) + setattr(opts, "ddp.rank", node_rank) + main(opts, **kwargs) + + +def main_worker(**kwargs): + opts = get_training_arguments() + print(opts) + # device set-up + opts = device_setup(opts) + + node_rank = getattr(opts, "ddp.rank", 0) + if node_rank < 0: + logger.error('--rank should be >=0. 
Got {}'.format(node_rank)) + + is_master_node = is_master(opts) + + # create the directory for saving results + save_dir = getattr(opts, "common.results_loc", "results") + run_label = getattr(opts, "common.run_label", "run_1") + exp_dir = '{}/{}'.format(save_dir, run_label) + setattr(opts, "common.exp_loc", exp_dir) + create_directories(dir_path=exp_dir, is_master_node=is_master_node) + + num_gpus = getattr(opts, "dev.num_gpus", 1) + world_size = getattr(opts, "ddp.world_size", -1) + use_distributed = getattr(opts, "ddp.enable", False) + if num_gpus <= 1: + use_distributed = False + setattr(opts, "ddp.use_distributed", use_distributed) + + # No of data workers = no of CPUs (if not specified or -1) + n_cpus = multiprocessing.cpu_count() + dataset_workers = getattr(opts, "dataset.workers", -1) + + norm_name = getattr(opts, "model.normalization.name", "batch_norm") + if use_distributed: + if world_size == -1: + logger.log("Setting --ddp.world-size the same as the number of available gpus") + world_size = num_gpus + setattr(opts, "ddp.world_size", world_size) + elif world_size != num_gpus: + logger.log("--ddp.world-size does not match num. available GPUs. Got {} !={}".format(world_size, num_gpus)) + logger.log("Setting --ddp.world-size=num_gpus") + world_size = num_gpus + setattr(opts, "ddp.world_size", world_size) + + if dataset_workers == -1 or dataset_workers is None: + setattr(opts, "dataset.workers", n_cpus // world_size) + + start_rank = getattr(opts, "ddp.rank", 0) + setattr(opts, "ddp.rank", None) + kwargs['start_rank'] = start_rank + torch.multiprocessing.spawn( + fn=distributed_worker, + args=(main, opts, kwargs), + nprocs=num_gpus, + ) + else: + if dataset_workers == -1: + setattr(opts, "dataset.workers", n_cpus) + + if norm_name in ["sync_batch_norm", "sbn"]: + setattr(opts, "model.normalization.name", "batch_norm") + + # adjust the batch size + train_bsize = getattr(opts, "dataset.train_batch_size0", 32) * max(1, num_gpus) + val_bsize = getattr(opts, "dataset.val_batch_size0", 32) * max(1, num_gpus) + setattr(opts, "dataset.train_batch_size0", train_bsize) + setattr(opts, "dataset.val_batch_size0", val_bsize) + setattr(opts, "dev.device_id", None) + main(opts=opts, **kwargs) + + +if __name__ == "__main__": + #multiprocessing.set_start_method('spawn', force=True) + + main_worker() diff --git a/MobileViTv3-v1/setup.py b/MobileViTv3-v1/setup.py new file mode 100644 index 0000000..9de4814 --- /dev/null +++ b/MobileViTv3-v1/setup.py @@ -0,0 +1,77 @@ +# +# For licensing see accompanying LICENSE file. 
+# + +import os +import sys +from setuptools import find_packages, setup + +if sys.version_info < (3, 6): + sys.exit("Sorry, Python >= 3.6 is required for cvnets.") + +if sys.platform == "darwin": + extra_compile_args = ["-stdlib=libc++", "-O3"] +else: + extra_compile_args = ["-std=c++11", "-O3"] + +VERSION = 0.1 + + +def do_setup(package_data): + setup( + name="cvnets", + version=VERSION, + description="CVNets: A library for training computer vision networks", + url="https://github.com/apple/ml-cvnets.git", + setup_requires=[ + 'numpy<1.20.0; python_version<"3.7"', + 'numpy; python_version>="3.7"', + "setuptools>=18.0", + ], + install_requires=[ + 'numpy<1.20.0; python_version<"3.7"', + 'numpy; python_version>="3.7"', + "torch", + "tqdm", + ], + packages=find_packages( + exclude=[ + "config_files", + "config_files.*" + ] + ), + package_data=package_data, + test_suite="tests", + entry_points={ + "console_scripts": [ + "cvnets-train = main_train:main_worker", + "cvnets-train-dist = main_train_dist:main_worker", + "cvnets-eval = main_eval:main_worker", + "cvnets-eval-seg = main_eval:main_worker_segmentation", + "cvnets-eval-det = main_eval:main_worker_detection", + "cvnets-convert = main_conversion:main_worker_conversion", + "cvnets-latency = main_latency:main_worker" + ], + }, + zip_safe=False, + ) + + +def get_files(path, relative_to="."): + all_files = [] + for root, _dirs, files in os.walk(path, followlinks=True): + root = os.path.relpath(root, relative_to) + for file in files: + if file.endswith(".pyc"): + continue + all_files.append(os.path.join(root, file)) + return all_files + + +if __name__ == "__main__": + package_data = { + "cvnets": ( + get_files(os.path.join("cvnets", "config")) + ) + } + do_setup(package_data) diff --git a/MobileViTv3-v2/config/classification/finetune_higher_res_in1k/mobilevit_v3.yaml b/MobileViTv3-v2/config/classification/finetune_higher_res_in1k/mobilevit_v3.yaml new file mode 100644 index 0000000..d972a74 --- /dev/null +++ b/MobileViTv3-v2/config/classification/finetune_higher_res_in1k/mobilevit_v3.yaml @@ -0,0 +1,80 @@ +common: + run_label: "run_1" + log_freq: 500 + auto_resume: true + mixed_precision: true + channels_last: true + tensorboard_logging: false + grad_clip: 10.0 +dataset: + root_train: "/media/Datasets/ILSVRC2012-raw/train" + root_val: "/media/Datasets/ILSVRC2012-raw/val" + name: "imagenet" + category: "classification" + train_batch_size0: 64 # effective batch size of 128 (64 x 2 GPUs) + val_batch_size0: 50 + eval_batch_size0: 50 + workers: 8 + persistent_workers: false + pin_memory: true +image_augmentation: + random_resized_crop: + enable: true + interpolation: "bicubic" + random_horizontal_flip: + enable: true + resize: + enable: true + size: 384 # shorter size is 384 + interpolation: "bicubic" + center_crop: + enable: true + size: 384 +sampler: + name: "batch_sampler" + bs: + crop_size_width: 384 + crop_size_height: 384 +loss: + category: "classification" + classification: + name: "cross_entropy" + label_smoothing: 0.1 +optim: + name: "sgd" + weight_decay: 4.e-5 + no_decay_bn_filter_bias: true + sgd: + momentum: 0.9 +scheduler: + name: "fixed" + max_epochs: 10 + fixed: + lr: 1.e-3 +model: + classification: + name: "mobilevit_v3" + mitv3: + width_multiplier: 0.5 + attn_norm_layer: "layer_norm_2d" + activation: + name: "swish" + normalization: + name: "batch_norm" + momentum: 0.1 + activation: + name: "swish" +ema: + enable: true + momentum: 0.00005 +ddp: + disable: True # False to used ddp + dist_url: "tcp://XX.XXX.XXX.XXX:XXXX" + rank: 0 + 
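+  # Note: dist_url, rank, and world_size in this ddp block are placeholders; when
+  # ddp is enabled, set dist_url to the address of the rank-0 server and give each
+  # server its own rank, as in the multi-server ImageNet config later in this patch.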
world_size: 6 + dist_port: 30768 +stats: + val: [ "loss", "top1", "top5" ] + train: ["loss"] + checkpoint_metric: "top1" + checkpoint_metric_max: true diff --git a/MobileViTv3-v2/config/classification/imagenet/mobilevit_v3_multiserver.py b/MobileViTv3-v2/config/classification/imagenet/mobilevit_v3_multiserver.py new file mode 100644 index 0000000..79d4cdd --- /dev/null +++ b/MobileViTv3-v2/config/classification/imagenet/mobilevit_v3_multiserver.py @@ -0,0 +1,103 @@ +common: + run_label: "run_1" + log_freq: 500 + auto_resume: true + mixed_precision: true + channels_last: true + tensorboard_logging: false + grad_clip: 10.0 +dataset: + root_train: "/media/Datasets/ILSVRC2012-raw/train" + root_val: "/media/Datasets/ILSVRC2012-raw/val" + name: "imagenet" + category: "classification" + train_batch_size0: 85 + val_batch_size0: 85 + eval_batch_size0: 1 + workers: 6 + prefetch_factor: 2 + persistent_workers: false + pin_memory: true +image_augmentation: + random_resized_crop: + enable: true + interpolation: "bicubic" + random_horizontal_flip: + enable: true + rand_augment: + enable: true + random_erase: + enable: true + p: 0.25 + mixup: + enable: true + alpha: 0.2 + cutmix: + enable: true + alpha: 1.0 + resize: + enable: true + size: 288 # shorter size is 288 + interpolation: "bicubic" + center_crop: + enable: true + size: 256 +sampler: + name: "batch_sampler" + bs: + crop_size_width: 256 + crop_size_height: 256 +loss: + category: "classification" + classification: + name: "cross_entropy" + label_smoothing: 0.1 +optim: + name: "adamw" + weight_decay: 0.05 + no_decay_bn_filter_bias: true + adamw: + beta1: 0.9 + beta2: 0.999 +scheduler: + name: "cosine" + is_iteration_based: false + max_epochs: 300 + warmup_iterations: 20000 + warmup_init_lr: 1.e-6 + cosine: + max_lr: 0.002 + min_lr: 0.0002 +model: + classification: + name: "mobilevit_v3" + mitv3: + width_multiplier: 1.00 + attn_norm_layer: "layer_norm_2d" + activation: + name: "swish" + normalization: + name: "batch_norm" + momentum: 0.1 + activation: + name: "swish" + layer: + global_pool: "mean" + conv_init: "kaiming_normal" + conv_init_std_dev: 0.02 + linear_init: "trunc_normal" + linear_init_std_dev: 0.02 +ema: + enable: true + momentum: 0.0005 +ddp: + disable: false # false if ddp is used + dist_url: "tcp://XX.XXX.XXX.XXX:XXXX" # ip address of server with rank 0 + rank: 0 # unique rank for each server + world_size: 6 + dist_port: 30768 +stats: + val: [ "loss", "top1", "top5" ] + train: ["loss"] + checkpoint_metric: "top1" + checkpoint_metric_max: true diff --git a/MobileViTv3-v2/config/classification/imagenet/mobilevit_v3_oneserver.py b/MobileViTv3-v2/config/classification/imagenet/mobilevit_v3_oneserver.py new file mode 100644 index 0000000..a07097b --- /dev/null +++ b/MobileViTv3-v2/config/classification/imagenet/mobilevit_v3_oneserver.py @@ -0,0 +1,97 @@ +common: + run_label: "run_1" + log_freq: 500 + auto_resume: true + mixed_precision: true + channels_last: true + tensorboard_logging: false + grad_clip: 10.0 +dataset: + root_train: "/media/Datasets/ILSVRC2012-raw/train" + root_val: "/media/Datasets/ILSVRC2012-raw/val" + name: "imagenet" + category: "classification" + train_batch_size0: 80 + val_batch_size0: 80 + eval_batch_size0: 80 + workers: 6 + prefetch_factor: 2 + persistent_workers: false + pin_memory: true +image_augmentation: + random_resized_crop: + enable: true + interpolation: "bicubic" + random_horizontal_flip: + enable: true + rand_augment: + enable: true + random_erase: + enable: true + p: 0.25 + mixup: + enable: true + alpha: 
0.2 + cutmix: + enable: true + alpha: 1.0 + resize: + enable: true + size: 288 # shorter size is 288 + interpolation: "bicubic" + center_crop: + enable: true + size: 256 +sampler: + name: "batch_sampler" + bs: + crop_size_width: 256 + crop_size_height: 256 +loss: + category: "classification" + classification: + name: "cross_entropy" + label_smoothing: 0.1 +optim: + name: "adamw" + weight_decay: 0.05 + no_decay_bn_filter_bias: true + adamw: + beta1: 0.9 + beta2: 0.999 +scheduler: + name: "cosine" + is_iteration_based: false + max_epochs: 300 + warmup_iterations: 20000 + warmup_init_lr: 1.e-6 + cosine: + max_lr: 0.002 + min_lr: 0.0002 +model: + classification: + name: "mobilevit_v3" + mitv3: + width_multiplier: 1.00 + attn_norm_layer: "layer_norm_2d" + activation: + name: "swish" + normalization: + name: "batch_norm" + momentum: 0.1 + activation: + name: "swish" + layer: + global_pool: "mean" + conv_init: "kaiming_normal" + conv_init_std_dev: 0.02 + linear_init: "trunc_normal" + linear_init_std_dev: 0.02 +ema: + enable: true + momentum: 0.0005 +stats: + val: [ "loss", "top1", "top5" ] + train: ["loss"] + checkpoint_metric: "top1" + checkpoint_metric_max: true diff --git a/MobileViTv3-v2/config/detection/ssd_coco/mobilevit_v3.yaml b/MobileViTv3-v2/config/detection/ssd_coco/mobilevit_v3.yaml new file mode 100644 index 0000000..ab2ef12 --- /dev/null +++ b/MobileViTv3-v2/config/detection/ssd_coco/mobilevit_v3.yaml @@ -0,0 +1,95 @@ +common: + run_label: "run_1" + accum_freq: 1 + accum_after_epoch: -1 + log_freq: 500 + auto_resume: true + mixed_precision: true +dataset: + root_train: "/mnt/vision_datasets/coco" + root_val: "/mnt/vision_datasets/coco" + name: "coco_ssd" + category: "detection" + train_batch_size0: 32 # effective batch size is 128 (32 * 4 GPUs) + val_batch_size0: 32 + eval_batch_size0: 1 + workers: 8 + persistent_workers: false + pin_memory: true +image_augmentation: + resize: + enable: true + size: [320, 320] + interpolation: "bicubic" +sampler: + name: "batch_sampler" + bs: + crop_size_width: 320 + crop_size_height: 320 +loss: + category: "detection" + detection: + name: "ssd_multibox_loss" + ssd_multibox_loss: + neg_pos_ratio: 3 +optim: + name: "adamw" + weight_decay: 0.05 + no_decay_bn_filter_bias: false + adamw: + beta1: 0.9 + beta2: 0.999 +scheduler: + name: "cosine" + is_iteration_based: false + max_epochs: 200 + warmup_iterations: 500 + warmup_init_lr: 0.00009 + cosine: + max_lr: 0.0009 + min_lr: 1.e-6 +anchor_generator: + name: "ssd" + ssd: + output_strides: [ 16, 32, 64, 128, 256, -1 ] + aspect_ratios: [ [ 2, 3 ], [ 2, 3 ], [ 2, 3 ], [ 2, 3 ], [ 2, 3 ], [2] ] + min_scale_ratio: 0.1 + max_scale_ratio: 1.05 +matcher: + name: "ssd" + ssd: + center_variance: 0.1 + size_variance: 0.2 + iou_threshold: 0.5 +model: + detection: + name: "ssd" + ssd: + proj_channels: [512, 256, 256, 128, 128, 64] + nms_iou_threshold: 0.5 + classification: + name: "mobilevit_v3" + mitv3: + width_multiplier: 2.0 + attn_norm_layer: "layer_norm_2d" + activation: + name: "swish" + normalization: + name: "sync_batch_norm" + momentum: 0.1 + activation: + name: "relu" + inplace: false + layer: + global_pool: "mean" + conv_init: "kaiming_normal" + linear_init: "normal" + conv_weight_std: false +ema: + enable: true + momentum: 0.0005 +stats: + val: [ "loss", "coco_map"] + train: ["loss"] + checkpoint_metric: "coco_map" + checkpoint_metric_max: true diff --git a/MobileViTv3-v2/config/segmentation/ade20k/deeplabv3_mobilevitv3.yaml b/MobileViTv3-v2/config/segmentation/ade20k/deeplabv3_mobilevitv3.yaml new file 
mode 100644 index 0000000..8e112ea --- /dev/null +++ b/MobileViTv3-v2/config/segmentation/ade20k/deeplabv3_mobilevitv3.yaml @@ -0,0 +1,108 @@ +common: + run_label: "run_1" + accum_freq: 1 + accum_after_epoch: -1 + log_freq: 200 + auto_resume: false + mixed_precision: true + grad_clip: 10.0 +dataset: + root_train: "/mnt/vision_datasets/ADEChallengeData2016/" + root_val: "/mnt/vision_datasets/ADEChallengeData2016/" + name: "ade20k" + category: "segmentation" + train_batch_size0: 4 # effective batch size is 16 ( 4 * 4 GPUs) + val_batch_size0: 4 + eval_batch_size0: 1 + workers: 4 + persistent_workers: false + pin_memory: false +image_augmentation: + random_crop: + enable: true + seg_class_max_ratio: 0.75 + pad_if_needed: true + mask_fill: 0 # background idx is 0 + random_horizontal_flip: + enable: true + resize: + enable: true + size: [512, 512] + interpolation: "bicubic" + random_short_size_resize: + enable: true + interpolation: "bicubic" + short_side_min: 256 + short_side_max: 768 + max_img_dim: 1024 + photo_metric_distort: + enable: true + random_rotate: + enable: true + angle: 10 + mask_fill: 0 # background idx is 0 + random_gaussian_noise: + enable: true +sampler: + name: "batch_sampler" + bs: + crop_size_width: 512 + crop_size_height: 512 +loss: + category: "segmentation" + ignore_idx: -1 + segmentation: + name: "cross_entropy" + cross_entropy: + aux_weight: 0.4 +optim: + name: "sgd" + weight_decay: 1.e-4 + no_decay_bn_filter_bias: true + sgd: + momentum: 0.9 +scheduler: + name: "cosine" + is_iteration_based: false + max_epochs: 120 + cosine: + max_lr: 0.02 + min_lr: 0.0002 +model: + segmentation: + name: "encoder_decoder" + lr_multiplier: 1 + seg_head: "deeplabv3" + output_stride: 8 + use_aux_head: true + activation: + name: "relu" + deeplabv3: + aspp_dropout: 0.1 + aspp_out_channels: 512 + aspp_rates: [ 12, 24, 36 ] + classification: + name: "mobilevit_v3" + mitv3: + width_multiplier: 1.0 + attn_norm_layer: "layer_norm_2d" + activation: + name: "swish" + normalization: + name: "sync_batch_norm" + momentum: 0.1 + activation: + name: "swish" + inplace: false + layer: + global_pool: "mean" + conv_init: "kaiming_uniform" + linear_init: "normal" +ema: + enable: true + momentum: 0.0005 +stats: + val: [ "loss", "iou" ] + train: [ "loss", "grad_norm" ] + checkpoint_metric: "iou" + checkpoint_metric_max: true diff --git a/MobileViTv3-v2/config/segmentation/ade20k/pspnet_mobilevitv3.yaml b/MobileViTv3-v2/config/segmentation/ade20k/pspnet_mobilevitv3.yaml new file mode 100644 index 0000000..9592828 --- /dev/null +++ b/MobileViTv3-v2/config/segmentation/ade20k/pspnet_mobilevitv3.yaml @@ -0,0 +1,108 @@ +common: + run_label: "run_1" + accum_freq: 1 + accum_after_epoch: -1 + log_freq: 200 + auto_resume: false + mixed_precision: true + grad_clip: 10.0 +dataset: + root_train: "/mnt/vision_datasets/ADEChallengeData2016/" + root_val: "/mnt/vision_datasets/ADEChallengeData2016/" + name: "ade20k" + category: "segmentation" + train_batch_size0: 4 # effective batch size is 16 ( 4 * 4 GPUs) + val_batch_size0: 4 + eval_batch_size0: 1 + workers: 4 + persistent_workers: false + pin_memory: false +image_augmentation: + random_crop: + enable: true + seg_class_max_ratio: 0.75 + pad_if_needed: true + mask_fill: 0 # background idx is 0 + random_horizontal_flip: + enable: true + resize: + enable: true + size: [512, 512] + interpolation: "bicubic" + random_short_size_resize: + enable: true + interpolation: "bicubic" + short_side_min: 256 + short_side_max: 768 + max_img_dim: 1024 + photo_metric_distort: + enable: true 
+ random_rotate: + enable: true + angle: 10 + mask_fill: 0 # background idx is 0 + random_gaussian_noise: + enable: true +sampler: + name: "batch_sampler" + bs: + crop_size_width: 512 + crop_size_height: 512 +loss: + category: "segmentation" + ignore_idx: -1 + segmentation: + name: "cross_entropy" + cross_entropy: + aux_weight: 0.4 +optim: + name: "sgd" + weight_decay: 1.e-4 + no_decay_bn_filter_bias: true + sgd: + momentum: 0.9 +scheduler: + name: "cosine" + is_iteration_based: false + max_epochs: 120 + cosine: + max_lr: 0.02 + min_lr: 0.0002 +model: + segmentation: + name: "encoder_decoder" + lr_multiplier: 1 + seg_head: "pspnet" + output_stride: 8 + use_aux_head: true + activation: + name: "relu" + pspnet: + psp_dropout: 0.1 + psp_out_channels: 512 + psp_pool_sizes: [ 1, 2, 3, 6 ] + classification: + name: "mobilevit_v3" + mitv3: + width_multiplier: 1.0 + attn_norm_layer: "layer_norm_2d" + activation: + name: "swish" + normalization: + name: "sync_batch_norm" + momentum: 0.1 + activation: + name: "swish" + inplace: false + layer: + global_pool: "mean" + conv_init: "kaiming_uniform" + linear_init: "normal" +ema: + enable: true + momentum: 0.0005 +stats: + val: [ "loss", "iou" ] + train: [ "loss", "grad_norm" ] + checkpoint_metric: "iou" + checkpoint_metric_max: true diff --git a/MobileViTv3-v2/config/segmentation/pascal_voc/deeplabv3_mobilevitv3.yaml b/MobileViTv3-v2/config/segmentation/pascal_voc/deeplabv3_mobilevitv3.yaml new file mode 100644 index 0000000..50ca3e9 --- /dev/null +++ b/MobileViTv3-v2/config/segmentation/pascal_voc/deeplabv3_mobilevitv3.yaml @@ -0,0 +1,114 @@ +common: + run_label: "run_1" + accum_freq: 1 + accum_after_epoch: -1 + log_freq: 200 + auto_resume: false + mixed_precision: true + grad_clip: 10.0 +dataset: + root_train: "/mnt/vision_datasets/pascal_voc/VOCdevkit/" + root_val: "/mnt/vision_datasets/pascal_voc/VOCdevkit/" + name: "pascal" + category: "segmentation" + train_batch_size0: 32 # effective batch size is 128 (32 * 4 GPUs) + val_batch_size0: 16 + eval_batch_size0: 1 + workers: 8 + persistent_workers: false + pin_memory: false + pascal: + use_coco_data: true + coco_root_dir: "/mnt/vision_datasets/coco_preprocess" +image_augmentation: + random_crop: + enable: true + seg_class_max_ratio: 0.75 + pad_if_needed: true + mask_fill: 255 # background idx is 255 + random_horizontal_flip: + enable: true + resize: + enable: true + size: [512, 512] + interpolation: "bicubic" + random_short_size_resize: + enable: true + interpolation: "bicubic" + short_side_min: 256 + short_side_max: 768 + max_img_dim: 1024 + photo_metric_distort: + enable: true + random_rotate: + enable: true + angle: 10 + mask_fill: 255 # background idx is 255 + random_gaussian_noise: + enable: true +sampler: + name: "batch_sampler" + bs: + crop_size_width: 512 + crop_size_height: 512 +loss: + category: "segmentation" + ignore_idx: 255 + segmentation: + name: "cross_entropy" + cross_entropy: + aux_weight: 0.4 +optim: + name: "adamw" + weight_decay: 0.05 #0.01 + no_decay_bn_filter_bias: true + adamw: + beta1: 0.9 + beta2: 0.999 +scheduler: + name: "cosine" + is_iteration_based: false + max_epochs: 50 + warmup_iterations: 500 + warmup_init_lr: 0.00005 + cosine: + max_lr: 0.0005 + min_lr: 1.e-6 +model: + segmentation: + name: "encoder_decoder" + lr_multiplier: 1 + seg_head: "deeplabv3" + output_stride: 16 + use_aux_head: true + activation: + name: "relu" + deeplabv3: + aspp_dropout: 0.1 + aspp_out_channels: 512 + aspp_rates: [ 6, 12, 18 ] + classification: + name: "mobilevit_v3" + mitv3: + 
width_multiplier: 1.0 + attn_norm_layer: "layer_norm_2d" + activation: + name: "swish" + normalization: + name: "sync_batch_norm" + momentum: 0.1 + activation: + name: "swish" + inplace: false + layer: + global_pool: "mean" + conv_init: "kaiming_uniform" + linear_init: "normal" +ema: + enable: true + momentum: 0.0005 +stats: + val: [ "loss", "iou" ] + train: [ "loss", "grad_norm" ] + checkpoint_metric: "iou" + checkpoint_metric_max: true diff --git a/MobileViTv3-v2/config/segmentation/pascal_voc/pspnet_mobilevitv3.yaml b/MobileViTv3-v2/config/segmentation/pascal_voc/pspnet_mobilevitv3.yaml new file mode 100644 index 0000000..3aa0e6d --- /dev/null +++ b/MobileViTv3-v2/config/segmentation/pascal_voc/pspnet_mobilevitv3.yaml @@ -0,0 +1,114 @@ +common: + run_label: "run_1" + accum_freq: 1 + accum_after_epoch: -1 + log_freq: 200 + auto_resume: false + mixed_precision: true + grad_clip: 10.0 +dataset: + root_train: "/mnt/vision_datasets/pascal_voc/VOCdevkit/" + root_val: "/mnt/vision_datasets/pascal_voc/VOCdevkit/" + name: "pascal" + category: "segmentation" + train_batch_size0: 32 # effective batch size is 128 (32 * 4 GPUs) + val_batch_size0: 16 + eval_batch_size0: 1 + workers: 8 + persistent_workers: false + pin_memory: false + pascal: + use_coco_data: true + coco_root_dir: "/mnt/vision_datasets/coco_preprocess" +image_augmentation: + random_crop: + enable: true + seg_class_max_ratio: 0.75 + pad_if_needed: true + mask_fill: 255 # background idx is 255 + random_horizontal_flip: + enable: true + resize: + enable: true + size: [512, 512] + interpolation: "bicubic" + random_short_size_resize: + enable: true + interpolation: "bicubic" + short_side_min: 256 + short_side_max: 768 + max_img_dim: 1024 + photo_metric_distort: + enable: true + random_rotate: + enable: true + angle: 10 + mask_fill: 255 # background idx is 255 + random_gaussian_noise: + enable: true +sampler: + name: "batch_sampler" + bs: + crop_size_width: 512 + crop_size_height: 512 +loss: + category: "segmentation" + ignore_idx: 255 + segmentation: + name: "cross_entropy" + cross_entropy: + aux_weight: 0.4 +optim: + name: "adamw" + weight_decay: 0.05 + no_decay_bn_filter_bias: true + adamw: + beta1: 0.9 + beta2: 0.999 +scheduler: + name: "cosine" + is_iteration_based: false + max_epochs: 50 + warmup_iterations: 500 + warmup_init_lr: 0.00005 + cosine: + max_lr: 0.0005 + min_lr: 1.e-6 +model: + segmentation: + name: "encoder_decoder" + lr_multiplier: 1 + seg_head: "pspnet" + output_stride: 16 + use_aux_head: true + activation: + name: "relu" + pspnet: + psp_dropout: 0.1 + psp_out_channels: 512 + psp_pool_sizes: [ 1, 2, 3, 6 ] + classification: + name: "mobilevit_v3" + mitv3: + width_multiplier: 1.0 + attn_norm_layer: "layer_norm_2d" + activation: + name: "swish" + normalization: + name: "sync_batch_norm" + momentum: 0.1 + activation: + name: "swish" + inplace: false + layer: + global_pool: "mean" + conv_init: "kaiming_uniform" + linear_init: "normal" +ema: + enable: true + momentum: 0.0005 +stats: + val: [ "loss", "iou" ] + train: [ "loss", "grad_norm" ] + checkpoint_metric: "iou" + checkpoint_metric_max: true diff --git a/MobileViTv3-v2/cvnets/models/classification/config/mobilevit_v3.py b/MobileViTv3-v2/cvnets/models/classification/config/mobilevit_v3.py new file mode 100644 index 0000000..013d641 --- /dev/null +++ b/MobileViTv3-v2/cvnets/models/classification/config/mobilevit_v3.py @@ -0,0 +1,76 @@ +# For licensing see accompanying LICENSE file. 
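For reference, every channel width in the configuration builder that follows is derived from a single width multiplier and rounded with make_divisible. A minimal standalone sketch of that arithmetic, assuming a typical make_divisible implementation (the real helper lives in utils.math_utils, which is not part of this excerpt):

def make_divisible(v, divisor=8, min_value=None):
    # Round v to the nearest multiple of divisor, without dropping below min_value
    # or shrinking by more than 10% (the usual MobileNet-style rounding rule).
    min_value = divisor if min_value is None else min_value
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v

width_multiplier = 1.0  # 0.5, 1.0, and 2.0 appear in the configs in this patch
for name, base, divisor in [("layer1", 64, 16), ("layer2", 128, 8),
                            ("layer3", 256, 8), ("layer4", 384, 8),
                            ("layer5", 512, 8)]:
    print(name, "out_channels =", make_divisible(base * width_multiplier, divisor=divisor))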
+ +import math +from typing import Dict, Sequence + +from utils import logger +from utils.math_utils import make_divisible, bound_fn + + +def get_configuration(opts) -> Dict: + + width_multiplier = getattr(opts, "model.classification.mitv3.width_multiplier", 1.0) + + ffn_multiplier = ( + 2 # bound_fn(min_val=2.0, max_val=4.0, value=2.0 * width_multiplier) + ) + mv2_exp_mult = 2 # max(1.0, min(2.0, 2.0 * width_multiplier)) + + layer_0_dim = bound_fn(min_val=16, max_val=64, value=32 * width_multiplier) + layer_0_dim = int(make_divisible(layer_0_dim, divisor=8, min_value=16)) + config = { + "layer0": { + "img_channels": 3, + "out_channels": layer_0_dim, + }, + "layer1": { + "out_channels": int(make_divisible(64 * width_multiplier, divisor=16)), + "expand_ratio": mv2_exp_mult, + "num_blocks": 1, + "stride": 1, + "block_type": "mv2", + }, + "layer2": { + "out_channels": int(make_divisible(128 * width_multiplier, divisor=8)), + "expand_ratio": mv2_exp_mult, + "num_blocks": 2, + "stride": 2, + "block_type": "mv2", + }, + "layer3": { # 28x28 + "out_channels": int(make_divisible(256 * width_multiplier, divisor=8)), + "attn_unit_dim": int(make_divisible(128 * width_multiplier, divisor=8)), + "ffn_multiplier": ffn_multiplier, + "attn_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "block_type": "mobilevit", + }, + "layer4": { # 14x14 + "out_channels": int(make_divisible(384 * width_multiplier, divisor=8)), + "attn_unit_dim": int(make_divisible(192 * width_multiplier, divisor=8)), + "ffn_multiplier": ffn_multiplier, + "attn_blocks": 4, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "block_type": "mobilevit", + }, + "layer5": { # 7x7 + "out_channels": int(make_divisible(512 * width_multiplier, divisor=8)), + "attn_unit_dim": int(make_divisible(256 * width_multiplier, divisor=8)), + "ffn_multiplier": ffn_multiplier, + "attn_blocks": 3, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "block_type": "mobilevit", + }, + "last_layer_exp_factor": 4, + } + + return config diff --git a/MobileViTv3-v2/cvnets/models/classification/mobilevit_v3.py b/MobileViTv3-v2/cvnets/models/classification/mobilevit_v3.py new file mode 100644 index 0000000..8bd651d --- /dev/null +++ b/MobileViTv3-v2/cvnets/models/classification/mobilevit_v3.py @@ -0,0 +1,226 @@ +# For licensing see accompanying LICENSE file. + +from torch import nn +import argparse +from typing import Dict, Tuple, Optional + +from . 
import register_cls_models +from .base_cls import BaseEncoder +from .config.mobilevit_v3 import get_configuration +from ...layers import ConvLayer, LinearLayer, GlobalPool, Identity +from ...modules import InvertedResidual +from ...modules import MobileViTBlockv3 as Block + + + + +@register_cls_models("mobilevit_v3") +class MobileViTv3(BaseEncoder): + """ + This class defines the MobileViTv3 architecture + """ + + def __init__(self, opts, *args, **kwargs) -> None: + num_classes = getattr(opts, "model.classification.n_classes", 1000) + pool_type = getattr(opts, "model.layer.global_pool", "mean") + + mobilevit_config = get_configuration(opts=opts) + image_channels = mobilevit_config["layer0"]["img_channels"] + out_channels = mobilevit_config["layer0"]["out_channels"] + + super().__init__(*args, **kwargs) + + # store model configuration in a dictionary + self.model_conf_dict = dict() + self.conv_1 = ConvLayer( + opts=opts, + in_channels=image_channels, + out_channels=out_channels, + kernel_size=3, + stride=2, + use_norm=True, + use_act=True, + ) + + self.model_conf_dict["conv1"] = {"in": image_channels, "out": out_channels} + + in_channels = out_channels + self.layer_1, out_channels = self._make_layer( + opts=opts, input_channel=in_channels, cfg=mobilevit_config["layer1"] + ) + self.model_conf_dict["layer1"] = {"in": in_channels, "out": out_channels} + + in_channels = out_channels + self.layer_2, out_channels = self._make_layer( + opts=opts, input_channel=in_channels, cfg=mobilevit_config["layer2"] + ) + self.model_conf_dict["layer2"] = {"in": in_channels, "out": out_channels} + + in_channels = out_channels + self.layer_3, out_channels = self._make_layer( + opts=opts, input_channel=in_channels, cfg=mobilevit_config["layer3"] + ) + self.model_conf_dict["layer3"] = {"in": in_channels, "out": out_channels} + + in_channels = out_channels + self.layer_4, out_channels = self._make_layer( + opts=opts, + input_channel=in_channels, + cfg=mobilevit_config["layer4"], + dilate=self.dilate_l4, + ) + self.model_conf_dict["layer4"] = {"in": in_channels, "out": out_channels} + + in_channels = out_channels + self.layer_5, out_channels = self._make_layer( + opts=opts, + input_channel=in_channels, + cfg=mobilevit_config["layer5"], + dilate=self.dilate_l5, + ) + self.model_conf_dict["layer5"] = {"in": in_channels, "out": out_channels} + + self.conv_1x1_exp = Identity() + self.model_conf_dict["exp_before_cls"] = { + "in": out_channels, + "out": out_channels, + } + + self.classifier = nn.Sequential( + GlobalPool(pool_type=pool_type, keep_dim=False), + LinearLayer(in_features=out_channels, out_features=num_classes, bias=True), + ) + + # check model + self.check_model() + + # weight initialization + self.reset_parameters(opts=opts) + + @classmethod + def add_arguments(cls, parser: argparse.ArgumentParser) -> argparse.ArgumentParser: + group = parser.add_argument_group( + title="".format(cls.__name__), description="".format(cls.__name__) + ) + group.add_argument( + "--model.classification.mitv3.attn-dropout", + type=float, + default=0.0, + help="Dropout in attention layer. Defaults to 0.0", + ) + group.add_argument( + "--model.classification.mitv3.ffn-dropout", + type=float, + default=0.0, + help="Dropout between FFN layers. Defaults to 0.0", + ) + group.add_argument( + "--model.classification.mitv3.dropout", + type=float, + default=0.0, + help="Dropout in attention layer. 
Defaults to 0.0", + ) + group.add_argument( + "--model.classification.mitv3.width-multiplier", + type=float, + default=1.0, + help="Width multiplier. Defaults to 1.0", + ) + group.add_argument( + "--model.classification.mitv3.attn-norm-layer", + type=str, + default="layer_norm_2d", + help="Norm layer in attention block. Defaults to LayerNorm", + ) + return parser + + def _make_layer( + self, opts, input_channel, cfg: Dict, dilate: Optional[bool] = False + ) -> Tuple[nn.Sequential, int]: + block_type = cfg.get("block_type", "mobilevit") + if block_type.lower() == "mobilevit": + return self._make_mit_layer( + opts=opts, input_channel=input_channel, cfg=cfg, dilate=dilate + ) + else: + return self._make_mobilenet_layer( + opts=opts, input_channel=input_channel, cfg=cfg + ) + + @staticmethod + def _make_mobilenet_layer( + opts, input_channel: int, cfg: Dict + ) -> Tuple[nn.Sequential, int]: + output_channels = cfg.get("out_channels") + num_blocks = cfg.get("num_blocks", 2) + expand_ratio = cfg.get("expand_ratio", 4) + block = [] + + for i in range(num_blocks): + stride = cfg.get("stride", 1) if i == 0 else 1 + + layer = InvertedResidual( + opts=opts, + in_channels=input_channel, + out_channels=output_channels, + stride=stride, + expand_ratio=expand_ratio, + ) + block.append(layer) + input_channel = output_channels + return nn.Sequential(*block), input_channel + + def _make_mit_layer( + self, opts, input_channel, cfg: Dict, dilate: Optional[bool] = False + ) -> Tuple[nn.Sequential, int]: + prev_dilation = self.dilation + block = [] + stride = cfg.get("stride", 1) + + if stride == 2: + if dilate: + self.dilation *= 2 + stride = 1 + + layer = InvertedResidual( + opts=opts, + in_channels=input_channel, + out_channels=cfg.get("out_channels"), + stride=stride, + expand_ratio=cfg.get("mv_expand_ratio", 4), + dilation=prev_dilation, + ) + + block.append(layer) + input_channel = cfg.get("out_channels") + + attn_unit_dim = cfg["attn_unit_dim"] + ffn_multiplier = cfg.get("ffn_multiplier") + + dropout = getattr(opts, "model.classification.mitv3.dropout", 0.0) + + block.append( + Block( + opts=opts, + in_channels=input_channel, + attn_unit_dim=attn_unit_dim, + ffn_multiplier=ffn_multiplier, + n_attn_blocks=cfg.get("attn_blocks", 1), + patch_h=cfg.get("patch_h", 2), + patch_w=cfg.get("patch_w", 2), + dropout=dropout, + ffn_dropout=getattr( + opts, "model.classification.mitv3.ffn_dropout", 0.0 + ), + attn_dropout=getattr( + opts, "model.classification.mitv3.attn_dropout", 0.0 + ), + conv_ksize=3, + attn_norm_layer=getattr( + opts, "model.classification.mitv3.attn_norm_layer", "layer_norm_2d" + ), + dilation=self.dilation, + ) + ) + + return nn.Sequential(*block), input_channel diff --git a/MobileViTv3-v2/cvnets/modules/__init__.py b/MobileViTv3-v2/cvnets/modules/__init__.py new file mode 100644 index 0000000..3e3ec0e --- /dev/null +++ b/MobileViTv3-v2/cvnets/modules/__init__.py @@ -0,0 +1,30 @@ +# For licensing see accompanying LICENSE file. 
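For reference, the MobileViT blocks defined later in this modules package convert a [B, C, H, W] feature map into patch sequences of shape [B * P, N, C] before the transformer and fold them back afterwards. A minimal, self-contained sketch of that round trip (illustrative only; the function names here are not part of the patch):

import math
import torch
from torch.nn import functional as F

def unfold(fm, patch_h, patch_w):
    B, C, H, W = fm.shape
    n_h, n_w = math.ceil(H / patch_h), math.ceil(W / patch_w)
    if (n_h * patch_h, n_w * patch_w) != (H, W):
        # same strategy as the block: resize rather than pad
        fm = F.interpolate(fm, size=(n_h * patch_h, n_w * patch_w),
                           mode="bilinear", align_corners=False)
    N, P = n_h * n_w, patch_h * patch_w
    # [B, C, H, W] -> [B*C*n_h, p_h, n_w, p_w] -> [B, C, N, P] -> [B*P, N, C]
    x = fm.reshape(B * C * n_h, patch_h, n_w, patch_w).transpose(1, 2)
    x = x.reshape(B, C, N, P).transpose(1, 3)
    return x.reshape(B * P, N, C), (B, C, n_h, n_w)

def fold(patches, meta, patch_h, patch_w):
    B, C, n_h, n_w = meta
    # invert the reshapes/transposes above
    x = patches.reshape(B, patch_h * patch_w, n_h * n_w, C).transpose(1, 3)
    x = x.reshape(B * C * n_h, n_w, patch_h, patch_w).transpose(1, 2)
    return x.reshape(B, C, n_h * patch_h, n_w * patch_w)

x = torch.randn(2, 8, 32, 32)
patches, meta = unfold(x, 2, 2)          # patches: [2*4, 256, 8]
assert torch.allclose(fold(patches, meta, 2, 2), x)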
+ +from .base_module import BaseModule +from .squeeze_excitation import SqueezeExcitation +from .mobilenetv2 import InvertedResidual, InvertedResidualSE +from .resnet_modules import BasicResNetBlock, BottleneckResNetBlock +from .aspp_block import ASPP +from .transformer import TransformerEncoder +from .pspnet_module import PSP +from .mobilevit_block import MobileViTBlock, MobileViTBlockv2, MobileViTBlockv3 +from .feature_pyramid import FeaturePyramidNetwork +from .ssd_heads import SSDHead, SSDInstanceHead + + +__all__ = [ + "InvertedResidual", + "InvertedResidualSE", + "BasicResNetBlock", + "BottleneckResNetBlock", + "ASPP", + "TransformerEncoder", + "SqueezeExcitation", + "PSP", + "MobileViTBlock", + "MobileViTBlockv2", + "MobileViTBlockv3", + "FeaturePyramidNetwork", + "SSDHead", + "SSDInstanceHead", +] diff --git a/MobileViTv3-v2/cvnets/modules/mobilevit_block.py b/MobileViTv3-v2/cvnets/modules/mobilevit_block.py new file mode 100644 index 0000000..65d3ce7 --- /dev/null +++ b/MobileViTv3-v2/cvnets/modules/mobilevit_block.py @@ -0,0 +1,1107 @@ +# For licensing see accompanying LICENSE file. + +import numpy as np +from torch import nn, Tensor +import math +import torch +from torch.nn import functional as F +from typing import Optional, Dict, Tuple, Union, Sequence + +from .transformer import TransformerEncoder, LinearAttnFFN +from .base_module import BaseModule +from ..misc.profiler import module_profile +from ..layers import ConvLayer, get_normalization_layer + + +class MobileViTBlock(BaseModule): + """ + This class defines the `MobileViT block `_ + + Args: + opts: command line arguments + in_channels (int): :math:`C_{in}` from an expected input of size :math:`(N, C_{in}, H, W)` + transformer_dim (int): Input dimension to the transformer unit + ffn_dim (int): Dimension of the FFN block + n_transformer_blocks (Optional[int]): Number of transformer blocks. Default: 2 + head_dim (Optional[int]): Head dimension in the multi-head attention. Default: 32 + attn_dropout (Optional[float]): Dropout in multi-head attention. Default: 0.0 + dropout (Optional[float]): Dropout rate. Default: 0.0 + ffn_dropout (Optional[float]): Dropout between FFN layers in transformer. Default: 0.0 + patch_h (Optional[int]): Patch height for unfolding operation. Default: 8 + patch_w (Optional[int]): Patch width for unfolding operation. Default: 8 + transformer_norm_layer (Optional[str]): Normalization layer in the transformer block. Default: layer_norm + conv_ksize (Optional[int]): Kernel size to learn local representations in MobileViT block. Default: 3 + dilation (Optional[int]): Dilation rate in convolutions. Default: 1 + no_fusion (Optional[bool]): Do not combine the input and output feature maps. 
Default: False + """ + + def __init__( + self, + opts, + in_channels: int, + transformer_dim: int, + ffn_dim: int, + n_transformer_blocks: Optional[int] = 2, + head_dim: Optional[int] = 32, + attn_dropout: Optional[float] = 0.0, + dropout: Optional[int] = 0.0, + ffn_dropout: Optional[int] = 0.0, + patch_h: Optional[int] = 8, + patch_w: Optional[int] = 8, + transformer_norm_layer: Optional[str] = "layer_norm", + conv_ksize: Optional[int] = 3, + dilation: Optional[int] = 1, + no_fusion: Optional[bool] = False, + *args, + **kwargs + ) -> None: + conv_3x3_in = ConvLayer( + opts=opts, + in_channels=in_channels, + out_channels=in_channels, + kernel_size=conv_ksize, + stride=1, + use_norm=True, + use_act=True, + dilation=dilation, + ) + conv_1x1_in = ConvLayer( + opts=opts, + in_channels=in_channels, + out_channels=transformer_dim, + kernel_size=1, + stride=1, + use_norm=False, + use_act=False, + ) + + conv_1x1_out = ConvLayer( + opts=opts, + in_channels=transformer_dim, + out_channels=in_channels, + kernel_size=1, + stride=1, + use_norm=True, + use_act=True, + ) + conv_3x3_out = None + if not no_fusion: + conv_3x3_out = ConvLayer( + opts=opts, + in_channels=2 * in_channels, + out_channels=in_channels, + kernel_size=conv_ksize, + stride=1, + use_norm=True, + use_act=True, + ) + super().__init__() + self.local_rep = nn.Sequential() + self.local_rep.add_module(name="conv_3x3", module=conv_3x3_in) + self.local_rep.add_module(name="conv_1x1", module=conv_1x1_in) + + assert transformer_dim % head_dim == 0 + num_heads = transformer_dim // head_dim + + global_rep = [ + TransformerEncoder( + opts=opts, + embed_dim=transformer_dim, + ffn_latent_dim=ffn_dim, + num_heads=num_heads, + attn_dropout=attn_dropout, + dropout=dropout, + ffn_dropout=ffn_dropout, + transformer_norm_layer=transformer_norm_layer, + ) + for _ in range(n_transformer_blocks) + ] + global_rep.append( + get_normalization_layer( + opts=opts, + norm_type=transformer_norm_layer, + num_features=transformer_dim, + ) + ) + self.global_rep = nn.Sequential(*global_rep) + + self.conv_proj = conv_1x1_out + + self.fusion = conv_3x3_out + + self.patch_h = patch_h + self.patch_w = patch_w + self.patch_area = self.patch_w * self.patch_h + + self.cnn_in_dim = in_channels + self.cnn_out_dim = transformer_dim + self.n_heads = num_heads + self.ffn_dim = ffn_dim + self.dropout = dropout + self.attn_dropout = attn_dropout + self.ffn_dropout = ffn_dropout + self.dilation = dilation + self.n_blocks = n_transformer_blocks + self.conv_ksize = conv_ksize + + def __repr__(self) -> str: + repr_str = "{}(".format(self.__class__.__name__) + + repr_str += "\n\t Local representations" + if isinstance(self.local_rep, nn.Sequential): + for m in self.local_rep: + repr_str += "\n\t\t {}".format(m) + else: + repr_str += "\n\t\t {}".format(self.local_rep) + + repr_str += "\n\t Global representations with patch size of {}x{}".format( + self.patch_h, self.patch_w + ) + if isinstance(self.global_rep, nn.Sequential): + for m in self.global_rep: + repr_str += "\n\t\t {}".format(m) + else: + repr_str += "\n\t\t {}".format(self.global_rep) + + if isinstance(self.conv_proj, nn.Sequential): + for m in self.conv_proj: + repr_str += "\n\t\t {}".format(m) + else: + repr_str += "\n\t\t {}".format(self.conv_proj) + + if self.fusion is not None: + repr_str += "\n\t Feature fusion" + if isinstance(self.fusion, nn.Sequential): + for m in self.fusion: + repr_str += "\n\t\t {}".format(m) + else: + repr_str += "\n\t\t {}".format(self.fusion) + + repr_str += "\n)" + return repr_str + + def 
unfolding(self, feature_map: Tensor) -> Tuple[Tensor, Dict]: + patch_w, patch_h = self.patch_w, self.patch_h + patch_area = int(patch_w * patch_h) + batch_size, in_channels, orig_h, orig_w = feature_map.shape + + new_h = int(math.ceil(orig_h / self.patch_h) * self.patch_h) + new_w = int(math.ceil(orig_w / self.patch_w) * self.patch_w) + + interpolate = False + if new_w != orig_w or new_h != orig_h: + # Note: Padding can be done, but then it needs to be handled in attention function. + feature_map = F.interpolate( + feature_map, size=(new_h, new_w), mode="bilinear", align_corners=False + ) + interpolate = True + + # number of patches along width and height + num_patch_w = new_w // patch_w # n_w + num_patch_h = new_h // patch_h # n_h + num_patches = num_patch_h * num_patch_w # N + + # [B, C, H, W] --> [B * C * n_h, p_h, n_w, p_w] + reshaped_fm = feature_map.reshape( + batch_size * in_channels * num_patch_h, patch_h, num_patch_w, patch_w + ) + # [B * C * n_h, p_h, n_w, p_w] --> [B * C * n_h, n_w, p_h, p_w] + transposed_fm = reshaped_fm.transpose(1, 2) + # [B * C * n_h, n_w, p_h, p_w] --> [B, C, N, P] where P = p_h * p_w and N = n_h * n_w + reshaped_fm = transposed_fm.reshape( + batch_size, in_channels, num_patches, patch_area + ) + # [B, C, N, P] --> [B, P, N, C] + transposed_fm = reshaped_fm.transpose(1, 3) + # [B, P, N, C] --> [BP, N, C] + patches = transposed_fm.reshape(batch_size * patch_area, num_patches, -1) + + info_dict = { + "orig_size": (orig_h, orig_w), + "batch_size": batch_size, + "interpolate": interpolate, + "total_patches": num_patches, + "num_patches_w": num_patch_w, + "num_patches_h": num_patch_h, + } + + return patches, info_dict + + def folding(self, patches: Tensor, info_dict: Dict) -> Tensor: + n_dim = patches.dim() + assert n_dim == 3, "Tensor should be of shape BPxNxC. 
Got: {}".format( + patches.shape + ) + # [BP, N, C] --> [B, P, N, C] + patches = patches.contiguous().view( + info_dict["batch_size"], self.patch_area, info_dict["total_patches"], -1 + ) + + batch_size, pixels, num_patches, channels = patches.size() + num_patch_h = info_dict["num_patches_h"] + num_patch_w = info_dict["num_patches_w"] + + # [B, P, N, C] --> [B, C, N, P] + patches = patches.transpose(1, 3) + + # [B, C, N, P] --> [B*C*n_h, n_w, p_h, p_w] + feature_map = patches.reshape( + batch_size * channels * num_patch_h, num_patch_w, self.patch_h, self.patch_w + ) + # [B*C*n_h, n_w, p_h, p_w] --> [B*C*n_h, p_h, n_w, p_w] + feature_map = feature_map.transpose(1, 2) + # [B*C*n_h, p_h, n_w, p_w] --> [B, C, H, W] + feature_map = feature_map.reshape( + batch_size, channels, num_patch_h * self.patch_h, num_patch_w * self.patch_w + ) + if info_dict["interpolate"]: + feature_map = F.interpolate( + feature_map, + size=info_dict["orig_size"], + mode="bilinear", + align_corners=False, + ) + return feature_map + + def forward_spatial(self, x: Tensor) -> Tensor: + res = x + + fm = self.local_rep(x) + + # convert feature map to patches + patches, info_dict = self.unfolding(fm) + + # learn global representations + for transformer_layer in self.global_rep: + patches = transformer_layer(patches) + + # [B x Patch x Patches x C] --> [B x C x Patches x Patch] + fm = self.folding(patches=patches, info_dict=info_dict) + + fm = self.conv_proj(fm) + + if self.fusion is not None: + fm = self.fusion(torch.cat((res, fm), dim=1)) + return fm + + def forward_temporal( + self, x: Tensor, x_prev: Optional[Tensor] = None + ) -> Union[Tensor, Tuple[Tensor, Tensor]]: + + res = x + fm = self.local_rep(x) + + # convert feature map to patches + patches, info_dict = self.unfolding(fm) + + # learn global representations + for global_layer in self.global_rep: + if isinstance(global_layer, TransformerEncoder): + patches = global_layer(x=patches, x_prev=x_prev) + else: + patches = global_layer(patches) + + # [B x Patch x Patches x C] --> [B x C x Patches x Patch] + fm = self.folding(patches=patches, info_dict=info_dict) + + fm = self.conv_proj(fm) + + if self.fusion is not None: + fm = self.fusion(torch.cat((res, fm), dim=1)) + return fm, patches + + def forward( + self, x: Union[Tensor, Tuple[Tensor]], *args, **kwargs + ) -> Union[Tensor, Tuple[Tensor, Tensor]]: + if isinstance(x, Tuple) and len(x) == 2: + # for spatio-temporal MobileViT + return self.forward_temporal(x=x[0], x_prev=x[1]) + elif isinstance(x, Tensor): + # For image data + return self.forward_spatial(x) + else: + raise NotImplementedError + + def profile_module( + self, input: Tensor, *args, **kwargs + ) -> Tuple[Tensor, float, float]: + params = macs = 0.0 + + res = input + out, p, m = module_profile(module=self.local_rep, x=input) + params += p + macs += m + + patches, info_dict = self.unfolding(feature_map=out) + + patches, p, m = module_profile(module=self.global_rep, x=patches) + params += p + macs += m + + fm = self.folding(patches=patches, info_dict=info_dict) + + out, p, m = module_profile(module=self.conv_proj, x=fm) + params += p + macs += m + + if self.fusion is not None: + out, p, m = module_profile( + module=self.fusion, x=torch.cat((out, res), dim=1) + ) + params += p + macs += m + + return res, params, macs + +# TODO: Add reference to MobileViTv2 paper + + +class MobileViTBlockv2(BaseModule): + """ + This class defines the `MobileViTv2 block <>`_ + + Args: + opts: command line arguments + in_channels (int): :math:`C_{in}` from an expected input of 
size :math:`(N, C_{in}, H, W)` + attn_unit_dim (int): Input dimension to the attention unit + ffn_multiplier (int): Expand the input dimensions by this factor in FFN. Default is 2. + n_attn_blocks (Optional[int]): Number of attention units. Default: 2 + attn_dropout (Optional[float]): Dropout in multi-head attention. Default: 0.0 + dropout (Optional[float]): Dropout rate. Default: 0.0 + ffn_dropout (Optional[float]): Dropout between FFN layers in transformer. Default: 0.0 + patch_h (Optional[int]): Patch height for unfolding operation. Default: 8 + patch_w (Optional[int]): Patch width for unfolding operation. Default: 8 + conv_ksize (Optional[int]): Kernel size to learn local representations in MobileViT block. Default: 3 + dilation (Optional[int]): Dilation rate in convolutions. Default: 1 + attn_norm_layer (Optional[str]): Normalization layer in the attention block. Default: layer_norm_2d + """ + + def __init__( + self, + opts, + in_channels: int, + attn_unit_dim: int, + ffn_multiplier: Optional[Union[Sequence[Union[int, float]], int, float]] = 2.0, + n_attn_blocks: Optional[int] = 2, + attn_dropout: Optional[float] = 0.0, + dropout: Optional[float] = 0.0, + ffn_dropout: Optional[float] = 0.0, + patch_h: Optional[int] = 8, + patch_w: Optional[int] = 8, + conv_ksize: Optional[int] = 3, + dilation: Optional[int] = 1, + attn_norm_layer: Optional[str] = "layer_norm_2d", + *args, + **kwargs + ) -> None: + cnn_out_dim = attn_unit_dim + + conv_3x3_in = ConvLayer( + opts=opts, + in_channels=in_channels, + out_channels=in_channels, + kernel_size=conv_ksize, + stride=1, + use_norm=True, + use_act=True, + dilation=dilation, + groups=in_channels, + ) + conv_1x1_in = ConvLayer( + opts=opts, + in_channels=in_channels, + out_channels=cnn_out_dim, + kernel_size=1, + stride=1, + use_norm=False, + use_act=False, + ) + + super(MobileViTBlockv2, self).__init__() + self.local_rep = nn.Sequential(conv_3x3_in, conv_1x1_in) + + self.global_rep, attn_unit_dim = self._build_attn_layer( + opts=opts, + d_model=attn_unit_dim, + ffn_mult=ffn_multiplier, + n_layers=n_attn_blocks, + attn_dropout=attn_dropout, + dropout=dropout, + ffn_dropout=ffn_dropout, + attn_norm_layer=attn_norm_layer, + ) + + self.conv_proj = ConvLayer( + opts=opts, + in_channels=cnn_out_dim, + out_channels=in_channels, + kernel_size=1, + stride=1, + use_norm=True, + use_act=False, + ) + + + self.patch_h = patch_h + self.patch_w = patch_w + self.patch_area = self.patch_w * self.patch_h + + self.cnn_in_dim = in_channels + self.cnn_out_dim = cnn_out_dim + self.transformer_in_dim = attn_unit_dim + self.dropout = dropout + self.attn_dropout = attn_dropout + self.ffn_dropout = ffn_dropout + self.n_blocks = n_attn_blocks + self.conv_ksize = conv_ksize + self.enable_coreml_compatible_fn = getattr( + opts, "common.enable_coreml_compatible_module", False + ) + + if self.enable_coreml_compatible_fn: + # we set persistent to false so that these weights are not part of model's state_dict + self.register_buffer( + name="unfolding_weights", + tensor=self._compute_unfolding_weights(), + persistent=False, + ) + + def _compute_unfolding_weights(self) -> Tensor: + # [P_h * P_w, P_h * P_w] + weights = torch.eye(self.patch_h * self.patch_w, dtype=torch.float) + # [P_h * P_w, P_h * P_w] --> [P_h * P_w, 1, P_h, P_w] + weights = weights.reshape( + (self.patch_h * self.patch_w, 1, self.patch_h, self.patch_w) + ) + # [P_h * P_w, 1, P_h, P_w] --> [P_h * P_w * C, 1, P_h, P_w] + weights = weights.repeat(self.cnn_out_dim, 1, 1, 1) + return weights + + def _build_attn_layer( + 
self, + opts, + d_model: int, + ffn_mult: Union[Sequence, int, float], + n_layers: int, + attn_dropout: float, + dropout: float, + ffn_dropout: float, + attn_norm_layer: str, + *args, + **kwargs + ) -> Tuple[nn.Module, int]: + + if isinstance(ffn_mult, Sequence) and len(ffn_mult) == 2: + ffn_dims = ( + np.linspace(ffn_mult[0], ffn_mult[1], n_layers, dtype=float) * d_model + ) + elif isinstance(ffn_mult, Sequence) and len(ffn_mult) == 1: + ffn_dims = [ffn_mult[0] * d_model] * n_layers + elif isinstance(ffn_mult, (int, float)): + ffn_dims = [ffn_mult * d_model] * n_layers + else: + raise NotImplementedError + + # ensure that dims are multiple of 16 + ffn_dims = [int((d // 16) * 16) for d in ffn_dims] + + global_rep = [ + LinearAttnFFN( + opts=opts, + embed_dim=d_model, + ffn_latent_dim=ffn_dims[block_idx], + attn_dropout=attn_dropout, + dropout=dropout, + ffn_dropout=ffn_dropout, + norm_layer=attn_norm_layer, + ) + for block_idx in range(n_layers) + ] + global_rep.append( + get_normalization_layer( + opts=opts, norm_type=attn_norm_layer, num_features=d_model + ) + ) + + return nn.Sequential(*global_rep), d_model + + def __repr__(self) -> str: + repr_str = "{}(".format(self.__class__.__name__) + + repr_str += "\n\t Local representations" + if isinstance(self.local_rep, nn.Sequential): + for m in self.local_rep: + repr_str += "\n\t\t {}".format(m) + else: + repr_str += "\n\t\t {}".format(self.local_rep) + + repr_str += "\n\t Global representations with patch size of {}x{}".format( + self.patch_h, + self.patch_w, + ) + if isinstance(self.global_rep, nn.Sequential): + for m in self.global_rep: + repr_str += "\n\t\t {}".format(m) + else: + repr_str += "\n\t\t {}".format(self.global_rep) + + if isinstance(self.conv_proj, nn.Sequential): + for m in self.conv_proj: + repr_str += "\n\t\t {}".format(m) + else: + repr_str += "\n\t\t {}".format(self.conv_proj) + + repr_str += "\n)" + return repr_str + + def unfolding_pytorch(self, feature_map: Tensor) -> Tuple[Tensor, Tuple[int, int]]: + + batch_size, in_channels, img_h, img_w = feature_map.shape + + # [B, C, H, W] --> [B, C, P, N] + patches = F.unfold( + feature_map, + kernel_size=(self.patch_h, self.patch_w), + stride=(self.patch_h, self.patch_w), + ) + patches = patches.reshape( + batch_size, in_channels, self.patch_h * self.patch_w, -1 + ) + + return patches, (img_h, img_w) + + def folding_pytorch(self, patches: Tensor, output_size: Tuple[int, int]) -> Tensor: + batch_size, in_dim, patch_size, n_patches = patches.shape + + # [B, C, P, N] + patches = patches.reshape(batch_size, in_dim * patch_size, n_patches) + + feature_map = F.fold( + patches, + output_size=output_size, + kernel_size=(self.patch_h, self.patch_w), + stride=(self.patch_h, self.patch_w), + ) + + return feature_map + + def unfolding_coreml(self, feature_map: Tensor) -> Tuple[Tensor, Tuple[int, int]]: + # im2col is not implemented in Coreml, so here we hack its implementation using conv2d + # we compute the weights + + # [B, C, H, W] --> [B, C, P, N] + batch_size, in_channels, img_h, img_w = feature_map.shape + # + patches = F.conv2d( + feature_map, + self.unfolding_weights, + bias=None, + stride=(self.patch_h, self.patch_w), + padding=0, + dilation=1, + groups=in_channels, + ) + patches = patches.reshape( + batch_size, in_channels, self.patch_h * self.patch_w, -1 + ) + return patches, (img_h, img_w) + + def folding_coreml(self, patches: Tensor, output_size: Tuple[int, int]) -> Tensor: + # col2im is not supported on coreml, so tracing fails + # We hack folding function via pixel_shuffle 
to enable coreml tracing + batch_size, in_dim, patch_size, n_patches = patches.shape + + n_patches_h = output_size[0] // self.patch_h + n_patches_w = output_size[1] // self.patch_w + + feature_map = patches.reshape( + batch_size, in_dim * self.patch_h * self.patch_w, n_patches_h, n_patches_w + ) + assert ( + self.patch_h == self.patch_w + ), "For Coreml, we need patch_h and patch_w are the same" + feature_map = F.pixel_shuffle(feature_map, upscale_factor=self.patch_h) + return feature_map + + def resize_input_if_needed(self, x): + batch_size, in_channels, orig_h, orig_w = x.shape + if orig_h % self.patch_h != 0 or orig_w % self.patch_w != 0: + new_h = int(math.ceil(orig_h / self.patch_h) * self.patch_h) + new_w = int(math.ceil(orig_w / self.patch_w) * self.patch_w) + x = F.interpolate( + x, size=(new_h, new_w), mode="bilinear", align_corners=True + ) + return x + + def forward_spatial(self, x: Tensor, *args, **kwargs) -> Tensor: + x = self.resize_input_if_needed(x) + + fm = self.local_rep(x) + + # convert feature map to patches + if self.enable_coreml_compatible_fn: + patches, output_size = self.unfolding_coreml(fm) + else: + patches, output_size = self.unfolding_pytorch(fm) + + # learn global representations on all patches + patches = self.global_rep(patches) + + # [B x Patch x Patches x C] --> [B x C x Patches x Patch] + if self.enable_coreml_compatible_fn: + fm = self.folding_coreml(patches=patches, output_size=output_size) + else: + fm = self.folding_pytorch(patches=patches, output_size=output_size) + fm = self.conv_proj(fm) + + + return fm + + def forward_temporal( + self, x: Tensor, x_prev: Tensor, *args, **kwargs + ) -> Union[Tensor, Tuple[Tensor, Tensor]]: + x = self.resize_input_if_needed(x) + + fm = self.local_rep(x) + + # convert feature map to patches + if self.enable_coreml_compatible_fn: + patches, output_size = self.unfolding_coreml(fm) + else: + patches, output_size = self.unfolding_pytorch(fm) + + # learn global representations + for global_layer in self.global_rep: + if isinstance(global_layer, LinearAttnFFN): + patches = global_layer(x=patches, x_prev=x_prev) + else: + patches = global_layer(patches) + + # [B x Patch x Patches x C] --> [B x C x Patches x Patch] + if self.enable_coreml_compatible_fn: + fm = self.folding_coreml(patches=patches, output_size=output_size) + else: + fm = self.folding_pytorch(patches=patches, output_size=output_size) + fm = self.conv_proj(fm) + + + return fm, patches + + def forward( + self, x: Union[Tensor, Tuple[Tensor]], *args, **kwargs + ) -> Union[Tensor, Tuple[Tensor, Tensor]]: + if isinstance(x, Tuple) and len(x) == 2: + # for spatio-temporal data (e.g., videos) + return self.forward_temporal(x=x[0], x_prev=x[1]) + elif isinstance(x, Tensor): + # for image data + return self.forward_spatial(x) + else: + raise NotImplementedError + + def profile_module( + self, input: Tensor, *args, **kwargs + ) -> Tuple[Tensor, float, float]: + params = macs = 0.0 + input = self.resize_input_if_needed(input) + + res = input + out, p, m = module_profile(module=self.local_rep, x=input) + params += p + macs += m + + patches, output_size = self.unfolding_pytorch(feature_map=out) + + patches, p, m = module_profile(module=self.global_rep, x=patches) + params += p + macs += m + + fm = self.folding_pytorch(patches=patches, output_size=output_size) + + out, p, m = module_profile(module=self.conv_proj, x=fm) + params += p + macs += m + + return res, params, macs + +# TODO: Add reference to MobileViTv3 paper + + +class MobileViTBlockv3(BaseModule): + """ + This 
class defines the `MobileViTv3 block <>`_ + + Args: + opts: command line arguments + in_channels (int): :math:`C_{in}` from an expected input of size :math:`(N, C_{in}, H, W)` + attn_unit_dim (int): Input dimension to the attention unit + ffn_multiplier (int): Expand the input dimensions by this factor in FFN. Default is 2. + n_attn_blocks (Optional[int]): Number of attention units. Default: 2 + attn_dropout (Optional[float]): Dropout in multi-head attention. Default: 0.0 + dropout (Optional[float]): Dropout rate. Default: 0.0 + ffn_dropout (Optional[float]): Dropout between FFN layers in transformer. Default: 0.0 + patch_h (Optional[int]): Patch height for unfolding operation. Default: 8 + patch_w (Optional[int]): Patch width for unfolding operation. Default: 8 + conv_ksize (Optional[int]): Kernel size to learn local representations in MobileViT block. Default: 3 + dilation (Optional[int]): Dilation rate in convolutions. Default: 1 + attn_norm_layer (Optional[str]): Normalization layer in the attention block. Default: layer_norm_2d + """ + + def __init__( + self, + opts, + in_channels: int, + attn_unit_dim: int, + ffn_multiplier: Optional[Union[Sequence[Union[int, float]], int, float]] = 2.0, + n_attn_blocks: Optional[int] = 2, + attn_dropout: Optional[float] = 0.0, + dropout: Optional[float] = 0.0, + ffn_dropout: Optional[float] = 0.0, + patch_h: Optional[int] = 8, + patch_w: Optional[int] = 8, + conv_ksize: Optional[int] = 3, + dilation: Optional[int] = 1, + attn_norm_layer: Optional[str] = "layer_norm_2d", + *args, + **kwargs + ) -> None: + cnn_out_dim = attn_unit_dim + + conv_3x3_in = ConvLayer( + opts=opts, + in_channels=in_channels, + out_channels=in_channels, + kernel_size=conv_ksize, + stride=1, + use_norm=True, + use_act=True, + dilation=dilation, + groups=in_channels, + ) + conv_1x1_in = ConvLayer( + opts=opts, + in_channels=in_channels, + out_channels=cnn_out_dim, + kernel_size=1, + stride=1, + use_norm=False, + use_act=False, + ) + + super(MobileViTBlockv3, self).__init__() + self.local_rep = nn.Sequential(conv_3x3_in, conv_1x1_in) + + self.global_rep, attn_unit_dim = self._build_attn_layer( + opts=opts, + d_model=attn_unit_dim, + ffn_mult=ffn_multiplier, + n_layers=n_attn_blocks, + attn_dropout=attn_dropout, + dropout=dropout, + ffn_dropout=ffn_dropout, + attn_norm_layer=attn_norm_layer, + ) + + + # MobileViTv3: input changed from just global to local+global + self.conv_proj = ConvLayer( + opts=opts, + in_channels= 2 * cnn_out_dim, + out_channels=in_channels, + kernel_size=1, + stride=1, + use_norm=True, + use_act=False, + ) + + self.patch_h = patch_h + self.patch_w = patch_w + self.patch_area = self.patch_w * self.patch_h + + self.cnn_in_dim = in_channels + self.cnn_out_dim = cnn_out_dim + self.transformer_in_dim = attn_unit_dim + self.dropout = dropout + self.attn_dropout = attn_dropout + self.ffn_dropout = ffn_dropout + self.n_blocks = n_attn_blocks + self.conv_ksize = conv_ksize + self.enable_coreml_compatible_fn = getattr( + opts, "common.enable_coreml_compatible_module", False + ) + + if self.enable_coreml_compatible_fn: + # we set persistent to false so that these weights are not part of model's state_dict + self.register_buffer( + name="unfolding_weights", + tensor=self._compute_unfolding_weights(), + persistent=False, + ) + + def _compute_unfolding_weights(self) -> Tensor: + # [P_h * P_w, P_h * P_w] + weights = torch.eye(self.patch_h * self.patch_w, dtype=torch.float) + # [P_h * P_w, P_h * P_w] --> [P_h * P_w, 1, P_h, P_w] + weights = weights.reshape( + (self.patch_h 
* self.patch_w, 1, self.patch_h, self.patch_w) + ) + # [P_h * P_w, 1, P_h, P_w] --> [P_h * P_w * C, 1, P_h, P_w] + weights = weights.repeat(self.cnn_out_dim, 1, 1, 1) + return weights + + def _build_attn_layer( + self, + opts, + d_model: int, + ffn_mult: Union[Sequence, int, float], + n_layers: int, + attn_dropout: float, + dropout: float, + ffn_dropout: float, + attn_norm_layer: str, + *args, + **kwargs + ) -> Tuple[nn.Module, int]: + + if isinstance(ffn_mult, Sequence) and len(ffn_mult) == 2: + ffn_dims = ( + np.linspace(ffn_mult[0], ffn_mult[1], n_layers, dtype=float) * d_model + ) + elif isinstance(ffn_mult, Sequence) and len(ffn_mult) == 1: + ffn_dims = [ffn_mult[0] * d_model] * n_layers + elif isinstance(ffn_mult, (int, float)): + ffn_dims = [ffn_mult * d_model] * n_layers + else: + raise NotImplementedError + + # ensure that dims are multiple of 16 + ffn_dims = [int((d // 16) * 16) for d in ffn_dims] + + global_rep = [ + LinearAttnFFN( + opts=opts, + embed_dim=d_model, + ffn_latent_dim=ffn_dims[block_idx], + attn_dropout=attn_dropout, + dropout=dropout, + ffn_dropout=ffn_dropout, + norm_layer=attn_norm_layer, + ) + for block_idx in range(n_layers) + ] + global_rep.append( + get_normalization_layer( + opts=opts, norm_type=attn_norm_layer, num_features=d_model + ) + ) + + return nn.Sequential(*global_rep), d_model + + def __repr__(self) -> str: + repr_str = "{}(".format(self.__class__.__name__) + + repr_str += "\n\t Local representations" + if isinstance(self.local_rep, nn.Sequential): + for m in self.local_rep: + repr_str += "\n\t\t {}".format(m) + else: + repr_str += "\n\t\t {}".format(self.local_rep) + + repr_str += "\n\t Global representations with patch size of {}x{}".format( + self.patch_h, + self.patch_w, + ) + if isinstance(self.global_rep, nn.Sequential): + for m in self.global_rep: + repr_str += "\n\t\t {}".format(m) + else: + repr_str += "\n\t\t {}".format(self.global_rep) + + if isinstance(self.conv_proj, nn.Sequential): + for m in self.conv_proj: + repr_str += "\n\t\t {}".format(m) + else: + repr_str += "\n\t\t {}".format(self.conv_proj) + + repr_str += "\n)" + return repr_str + + def unfolding_pytorch(self, feature_map: Tensor) -> Tuple[Tensor, Tuple[int, int]]: + + batch_size, in_channels, img_h, img_w = feature_map.shape + + # [B, C, H, W] --> [B, C, P, N] + patches = F.unfold( + feature_map, + kernel_size=(self.patch_h, self.patch_w), + stride=(self.patch_h, self.patch_w), + ) + patches = patches.reshape( + batch_size, in_channels, self.patch_h * self.patch_w, -1 + ) + + return patches, (img_h, img_w) + + def folding_pytorch(self, patches: Tensor, output_size: Tuple[int, int]) -> Tensor: + batch_size, in_dim, patch_size, n_patches = patches.shape + + # [B, C, P, N] + patches = patches.reshape(batch_size, in_dim * patch_size, n_patches) + + feature_map = F.fold( + patches, + output_size=output_size, + kernel_size=(self.patch_h, self.patch_w), + stride=(self.patch_h, self.patch_w), + ) + + return feature_map + + def unfolding_coreml(self, feature_map: Tensor) -> Tuple[Tensor, Tuple[int, int]]: + # im2col is not implemented in Coreml, so here we hack its implementation using conv2d + # we compute the weights + + # [B, C, H, W] --> [B, C, P, N] + batch_size, in_channels, img_h, img_w = feature_map.shape + # + patches = F.conv2d( + feature_map, + self.unfolding_weights, + bias=None, + stride=(self.patch_h, self.patch_w), + padding=0, + dilation=1, + groups=in_channels, + ) + patches = patches.reshape( + batch_size, in_channels, self.patch_h * self.patch_w, -1 + ) + return 
patches, (img_h, img_w) + + def folding_coreml(self, patches: Tensor, output_size: Tuple[int, int]) -> Tensor: + # col2im is not supported on coreml, so tracing fails + # We hack folding function via pixel_shuffle to enable coreml tracing + batch_size, in_dim, patch_size, n_patches = patches.shape + + n_patches_h = output_size[0] // self.patch_h + n_patches_w = output_size[1] // self.patch_w + + feature_map = patches.reshape( + batch_size, in_dim * self.patch_h * self.patch_w, n_patches_h, n_patches_w + ) + assert ( + self.patch_h == self.patch_w + ), "For Coreml, we need patch_h and patch_w are the same" + feature_map = F.pixel_shuffle(feature_map, upscale_factor=self.patch_h) + return feature_map + + def resize_input_if_needed(self, x): + batch_size, in_channels, orig_h, orig_w = x.shape + if orig_h % self.patch_h != 0 or orig_w % self.patch_w != 0: + new_h = int(math.ceil(orig_h / self.patch_h) * self.patch_h) + new_w = int(math.ceil(orig_w / self.patch_w) * self.patch_w) + x = F.interpolate( + x, size=(new_h, new_w), mode="bilinear", align_corners=True + ) + return x + + def forward_spatial(self, x: Tensor, *args, **kwargs) -> Tensor: + x = self.resize_input_if_needed(x) + + fm_conv = self.local_rep(x) + + # convert feature map to patches + if self.enable_coreml_compatible_fn: + patches, output_size = self.unfolding_coreml(fm_conv) + else: + patches, output_size = self.unfolding_pytorch(fm_conv) + + # learn global representations on all patches + patches = self.global_rep(patches) + + # [B x Patch x Patches x C] --> [B x C x Patches x Patch] + if self.enable_coreml_compatible_fn: + fm = self.folding_coreml(patches=patches, output_size=output_size) + else: + fm = self.folding_pytorch(patches=patches, output_size=output_size) + + # MobileViTv3: local+global instead of only global + fm = self.conv_proj(torch.cat((fm,fm_conv), dim=1) + ) + + # MobileViTv3: skip connection + fm = fm + x + + return fm + + def forward_temporal( + self, x: Tensor, x_prev: Tensor, *args, **kwargs + ) -> Union[Tensor, Tuple[Tensor, Tensor]]: + x = self.resize_input_if_needed(x) + + fm_conv = self.local_rep(x) + + # convert feature map to patches + if self.enable_coreml_compatible_fn: + patches, output_size = self.unfolding_coreml(fm_conv) + else: + patches, output_size = self.unfolding_pytorch(fm_conv) + + # learn global representations + for global_layer in self.global_rep: + if isinstance(global_layer, LinearAttnFFN): + patches = global_layer(x=patches, x_prev=x_prev) + else: + patches = global_layer(patches) + + # [B x Patch x Patches x C] --> [B x C x Patches x Patch] + if self.enable_coreml_compatible_fn: + fm = self.folding_coreml(patches=patches, output_size=output_size) + else: + fm = self.folding_pytorch(patches=patches, output_size=output_size) + + # MobileViTv3: local+global instead of only global + fm = self.conv_proj(torch.cat((fm,fm_conv), dim=1) + ) + + # MobileViTv3: skip connection + fm = fm + x + + return fm, patches + + def forward( + self, x: Union[Tensor, Tuple[Tensor]], *args, **kwargs + ) -> Union[Tensor, Tuple[Tensor, Tensor]]: + if isinstance(x, Tuple) and len(x) == 2: + # for spatio-temporal data (e.g., videos) + return self.forward_temporal(x=x[0], x_prev=x[1]) + elif isinstance(x, Tensor): + # for image data + return self.forward_spatial(x) + else: + raise NotImplementedError + + def profile_module( + self, input: Tensor, *args, **kwargs + ) -> Tuple[Tensor, float, float]: + params = macs = 0.0 + input = self.resize_input_if_needed(input) + + res = input + out, p, m = 
module_profile(module=self.local_rep, x=input) + params += p + macs += m + + patches, output_size = self.unfolding_pytorch(feature_map=out) + + patches, p, m = module_profile(module=self.global_rep, x=patches) + params += p + macs += m + + fm = self.folding_pytorch(patches=patches, output_size=output_size) + + out, p, m = module_profile(module=self.conv_proj, x=torch.cat((fm,out),dim=1)) + params += p + macs += m + + return res, params, macs diff --git a/MobileViTv3-v2/environment_mbvt2.yml b/MobileViTv3-v2/environment_mbvt2.yml new file mode 100644 index 0000000..9f1797b --- /dev/null +++ b/MobileViTv3-v2/environment_mbvt2.yml @@ -0,0 +1,172 @@ +name: mbvt3 +channels: + - pytorch + - conda-forge + - anaconda + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - _openmp_mutex=5.1=1_gnu + - blas=1.0=mkl + - blosc=1.21.0=h8c45485_0 + - brotli=1.0.9=he6710b0_2 + - brunsli=0.1=h2531618_0 + - bzip2=1.0.8=h7b6447c_0 + - c-ares=1.18.1=h7f8727e_0 + - ca-certificates=2022.5.18.1=ha878542_0 + - cfitsio=3.470=hf0d0db6_6 + - charls=2.2.0=h2531618_0 + - charset-normalizer=2.0.4=pyhd3eb1b0_0 + - cloudpickle=2.0.0=pyhd3eb1b0_0 + - cudatoolkit=11.3.1=h2bc3f7f_2 + - dask-core=2022.2.1=pyhd3eb1b0_0 + - ffmpeg=4.3=hf484d3e_0 + - freetype=2.11.0=h70c0345_0 + - fsspec=2022.2.0=pyhd3eb1b0_0 + - giflib=5.2.1=h7b6447c_0 + - gmp=6.2.1=h295c915_3 + - gnutls=3.6.15=he1e5248_0 + - idna=3.3=pyhd3eb1b0_0 + - imageio=2.9.0=pyhd3eb1b0_0 + - intel-openmp=2021.4.0=h06a4308_3561 + - jpeg=9e=h7f8727e_0 + - jxrlib=1.1=h7b6447c_2 + - krb5=1.19.2=hac12032_0 + - lame=3.100=h7b6447c_0 + - lcms2=2.12=h3be6417_0 + - ld_impl_linux-64=2.38=h1181459_1 + - lerc=3.0=h295c915_0 + - libaec=1.0.4=he6710b0_1 + - libcurl=7.82.0=h0b77cf5_0 + - libdeflate=1.8=h7f8727e_5 + - libedit=3.1.20210910=h7f8727e_0 + - libev=4.33=h7f8727e_1 + - libffi=3.3=he6710b0_2 + - libgcc-ng=11.2.0=h1234567_1 + - libgfortran-ng=7.5.0=ha8ba4b0_17 + - libgfortran4=7.5.0=ha8ba4b0_17 + - libgomp=11.2.0=h1234567_1 + - libiconv=1.16=h7f8727e_2 + - libidn2=2.3.2=h7f8727e_0 + - libnghttp2=1.46.0=hce63b2e_0 + - libpng=1.6.37=hbc83047_0 + - libssh2=1.10.0=h8f2d780_0 + - libstdcxx-ng=11.2.0=h1234567_1 + - libtasn1=4.16.0=h27cfd23_0 + - libtiff=4.2.0=h85742a9_0 + - libunistring=0.9.10=h27cfd23_0 + - libuuid=1.0.3=h7f8727e_2 + - libuv=1.40.0=h7b6447c_0 + - libwebp=1.2.2=h55f646e_0 + - libwebp-base=1.2.2=h7f8727e_0 + - libzopfli=1.0.3=he6710b0_0 + - lz4-c=1.9.3=h295c915_1 + - mkl=2021.4.0=h06a4308_640 + - mkl_fft=1.3.1=py310hd6ae3a3_0 + - mkl_random=1.2.2=py310h00e6091_0 + - ncurses=6.3=h7f8727e_2 + - nettle=3.7.3=hbbd107a_1 + - networkx=2.7.1=pyhd3eb1b0_0 + - numpy-base=1.22.3=py310h9585f30_0 + - openh264=2.1.1=h4ff587b_0 + - openjpeg=2.4.0=h3ad879b_0 + - openssl=1.1.1o=h166bdaf_0 + - packaging=21.3=pyhd3eb1b0_0 + - partd=1.2.0=pyhd3eb1b0_1 + - pycparser=2.21=pyhd3eb1b0_0 + - pyopenssl=22.0.0=pyhd3eb1b0_0 + - pyparsing=3.0.4=pyhd3eb1b0_0 + - python=3.10.4=h12debd9_0 + - python_abi=3.10=2_cp310 + - pytorch=1.11.0=py3.10_cuda11.3_cudnn8.2.0_0 + - pytorch-mutex=1.0=cuda + - readline=8.1.2=h7f8727e_1 + - requests=2.27.1=pyhd3eb1b0_0 + - six=1.16.0=pyhd3eb1b0_1 + - snappy=1.1.9=h295c915_0 + - sqlite=3.38.3=hc218d9a_0 + - tifffile=2021.7.2=pyhd3eb1b0_2 + - tk=8.6.12=h1ccaba5_0 + - toolz=0.11.2=pyhd3eb1b0_0 + - typing_extensions=4.1.1=pyh06a4308_0 + - tzdata=2022a=hda174b7_0 + - wheel=0.37.1=pyhd3eb1b0_0 + - xz=5.2.5=h7f8727e_1 + - yaml=0.2.5=h7b6447c_0 + - zfp=0.5.5=h295c915_6 + - zlib=1.2.12=h7f8727e_2 + - zstd=1.4.9=haebb681_0 + - pip: + - absl-py==1.1.0 + - appdirs==1.4.4 + - 
av==9.2.0 + - black==22.3.0 + - brotlipy==0.7.0 + - cachetools==5.2.0 + - certifi==2022.5.18.1 + - cffi==1.15.0 + - cityscapesscripts==2.2.0 + - click==8.1.3 + - coloredlogs==15.0.1 + - coremltools==5.2.0 + - cryptography==37.0.1 + - cycler==0.11.0 + - cytoolz==0.11.0 + - fonttools==4.33.3 + - fvcore==0.1.5.post20220512 + - google-auth==2.6.6 + - google-auth-oauthlib==0.4.6 + - grpcio==1.46.3 + - humanfriendly==10.0 + - imagecodecs==2021.8.26 + - iopath==0.1.9 + - kiwisolver==1.4.2 + - locket==0.2.1 + - markdown==3.3.7 + - matplotlib==3.5.2 + - mkl-fft==1.3.1 + - mkl-random==1.2.2 + - mkl-service==2.4.0 + - mpmath==1.2.1 + - mypy-extensions==0.4.3 + - numpy==1.22.3 + - oauthlib==3.2.0 + - opencv-contrib-python==4.5.5.64 + - parameterized==0.8.1 + - pathspec==0.9.0 + - pillow==9.0.1 + - pip==21.2.4 + - platformdirs==2.5.2 + - portalocker==2.4.0 + - protobuf==3.20.0 + - psutil==5.9.1 + - pyasn1==0.4.8 + - pyasn1-modules==0.2.8 + - pycocotools==2.0.4 + - pyquaternion==0.9.9 + - pysocks==1.7.1 + - python-dateutil==2.8.2 + - pytorchvideo==0.1.5 + - pywavelets==1.3.0 + - pyyaml==6.0 + - requests-oauthlib==1.3.1 + - rsa==4.8 + - scikit-image==0.19.2 + - scipy==1.7.3 + - setuptools==61.2.0 + - sympy==1.10.1 + - tabulate==0.8.9 + - tensorboard==2.9.0 + - tensorboard-data-server==0.6.1 + - tensorboard-plugin-wit==1.8.1 + - termcolor==1.1.0 + - tomli==2.0.1 + - torch==1.11.0 + - torchvision==0.12.0 + - tqdm==4.64.0 + - typing==3.7.4.3 + - ujson==5.1.0 + - urllib3==1.26.9 + - werkzeug==2.1.2 + - yacs==0.1.8 +prefix: ~/anaconda3/envs/mbvt2 diff --git a/README.md b/README.md new file mode 100644 index 0000000..216b3c6 --- /dev/null +++ b/README.md @@ -0,0 +1,73 @@ +# MobileViTv3: Mobile-Friendly Vision Transformer with Simple and Effective Fusion of Local, Global and Input Features + +This repository contains MobileViTv3's source code for training and evaluation, and is inspired by MobileViT ([paper](https://arxiv.org/abs/2110.02178?context=cs.LG), [code](https://github.com/apple/ml-cvnets)). + +## Installation and Training models: +We recommend using Python 3.8+ and [PyTorch](https://pytorch.org) (version >= v1.8.0) inside a `conda` environment. +For setting up the Python environment with conda, see [here](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html). + + +### MobileViTv3\-S,XS,XXS +Download [MobileViTv1](https://github.com/apple/ml-cvnets/tree/d38a116fe134a8cd5db18670764fdaafd39a5d4f) and replace its files with those provided in [MobileViTv3-v1](MobileViTv3-v1). + +Conda environment used for training: [environment_cvnet.yml](MobileViTv3-v1). + +Then install according to the instructions provided in the downloaded repository. +For training, use the training-and-evaluation readme provided in the downloaded repository. + + +### MobileViTv3\-1.0,0.75,0.5 +Download [MobileViTv2](https://github.com/apple/ml-cvnets/tree/84d992f413e52c0468f86d23196efd9dad885e6f) and replace its files with those provided in [MobileViTv3-v2](MobileViTv3-v2). + +Conda environment used for training: [environment_mbvt2.yml](MobileViTv3-v2). + +Then install according to the instructions provided in the downloaded repository. +For training, use the training-and-evaluation readme provided in the downloaded repository. + + + +## Trained models: + +The checkpoint\_ema\_best.pt file inside each model folder is used to generate the reported accuracy of the models.
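As a quick sanity check, a released checkpoint can be inspected with plain PyTorch before wiring it into cvnets. The snippet below is a minimal sketch under assumptions: the folder name is one of those listed in the tables that follow, and the file holds either a flat `state_dict` or a dict with a `model` entry (the exact layout depends on the cvnets version used for training).

```python
import torch

# Hypothetical path: substitute one of the model folders listed below.
ckpt = torch.load("mobilevitv3_S_e300_7930/checkpoint_ema_best.pt", map_location="cpu")

# Assumption: the file is either a flat state_dict or a wrapper dict with a "model" key.
state_dict = ckpt["model"] if isinstance(ckpt, dict) and "model" in ckpt else ckpt
if hasattr(state_dict, "state_dict"):  # a full nn.Module was saved instead
    state_dict = state_dict.state_dict()

n_params = sum(t.numel() for t in state_dict.values() if torch.is_tensor(t))
print(f"{len(state_dict)} tensors, {n_params / 1e6:.2f}M parameters")
```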
+ +## Classification ImageNet-1K: +| Model name | Top-1 accuracy (%) | Folder name | +| :---: | :---: | :---: | +| MobileViTv3\-S | 79.3 | mobilevitv3\_S\_e300\_7930 | +| MobileViTv3\-XS | 76.7 | mobilevitv3\_XS\_e300\_7671 | +| MobileViTv3\-XXS | 70.98 | mobilevitv3\_XXS\_e300\_7098 | +| MobileViTv3\-1.0 | 78.64 | mobilevitv3\_1\_0\_0 | +| MobileViTv3\-0.75 | 76.55 | mobilevitv3\_0\_7\_5 | +| MobileViTv3\-0.5 | 72.33 | mobilevitv3\_0\_5\_0 | + +## Segmentation PASCAL VOC 2012: +| Model name | mIoU (%) | Folder name | +| :---: | :---: | :---: | +| MobileViTv3\-S | 79.59 | mobilevitv3\_S\_voc\_e50\_7959 | +| MobileViTv3\-XS | 78.77 | mobilevitv3\_XS\_voc\_e50\_7877 | +| MobileViTv3\-XXS | 74.01 | mobilevitv3\_XXS\_voc\_e50\_7404 | +| MobileViTv3\-1.0 | 80.04 | mobilevitv3\_voc\_1\_0\_0 | +| MobileViTv3\-0.5 | 76.48 | mobilevitv3\_voc\_0\_5\_0 | + +## Segmentation ADE20K: +| Model name | mIoU (%) | Folder name | +| :---: | :---: | :---: | +| MobileViTv3\-1.0 | 39.13 | mobilevitv3\_ade20k\_1\_0\_0 | +| MobileViTv3\-0.75 | 36.43 | mobilevitv3\_ade20k\_0\_7\_5 | +| MobileViTv3\-0.5 | 39.13 | mobilevitv3\_ade20k\_0\_5\_0 | + +## Detection COCO: +| Model name | mAP (%) | Folder name | +| :---: | :---: | :---: | +| MobileViTv3\-S | 27.3 | mobilevitv3\_S\_coco\_e200\_2730 | +| MobileViTv3\-XS | 25.6 | mobilevitv3\_XS\_coco\_e200\_2560 | +| MobileViTv3\-XXS | 19.3 | mobilevitv3\_XXS\_coco\_e200\_1930 | +| MobileViTv3\-1.0 | 27.0 | mobilevitv3\_coco\_1\_0\_0 | +| MobileViTv3\-0.75 | 25.0 | mobilevitv3\_coco\_0\_7\_5 | +| MobileViTv3\-0.5 | 21.8 | mobilevitv3\_coco\_0\_5\_0 | + + +## Citation + +MobileViTv3 paper reference will be added soon.
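For readers skimming the diff above, the functional difference between `MobileViTBlockv3` and `MobileViTBlockv2` comes down to two steps in `forward_spatial`/`forward_temporal`: the output projection consumes the concatenation of the local (convolutional) and global (attention) feature maps instead of the global features alone, and the block input is added back as a residual. The sketch below restates that idea with plain `torch.nn` layers; the class name, the per-patch MLP standing in for the `LinearAttnFFN` units, and the omitted `ConvLayer`/normalization details are simplifications for illustration, not the repository implementation.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class TinyMobileViTv3Fusion(nn.Module):
    """Simplified illustration of the MobileViTv3 fusion changes (not the cvnets module)."""

    def __init__(self, in_channels: int, attn_dim: int, patch: int = 2) -> None:
        super().__init__()
        self.patch = patch
        # Local representation: depthwise 3x3 conv followed by a 1x1 projection.
        self.local_rep = nn.Sequential(
            nn.Conv2d(in_channels, in_channels, 3, padding=1, groups=in_channels),
            nn.BatchNorm2d(in_channels),
            nn.SiLU(),
            nn.Conv2d(in_channels, attn_dim, 1, bias=False),
        )
        # Stand-in for the global representation (the real block uses linear attention).
        self.global_rep = nn.Sequential(
            nn.Linear(attn_dim, attn_dim), nn.SiLU(), nn.Linear(attn_dim, attn_dim)
        )
        # MobileViTv3: the projection sees local + global features, hence 2 * attn_dim.
        self.conv_proj = nn.Sequential(
            nn.Conv2d(2 * attn_dim, in_channels, 1, bias=False),
            nn.BatchNorm2d(in_channels),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        fm_conv = self.local_rep(x)                      # [B, D, H, W]
        b, d, h, w = fm_conv.shape
        p = self.patch                                   # H and W must be divisible by p
        # Unfold into non-overlapping p x p patches: [B, D*p*p, N].
        patches = F.unfold(fm_conv, kernel_size=p, stride=p)
        patches = patches.reshape(b, d, p * p, -1).permute(0, 2, 3, 1)  # [B, P, N, D]
        patches = self.global_rep(patches)               # token-wise mixing stand-in
        patches = patches.permute(0, 3, 1, 2).reshape(b, d * p * p, -1)
        fm = F.fold(patches, output_size=(h, w), kernel_size=p, stride=p)
        # MobileViTv3 change 1: fuse local (fm_conv) and global (fm) features.
        fm = self.conv_proj(torch.cat((fm, fm_conv), dim=1))
        # MobileViTv3 change 2: residual connection from the block input.
        return fm + x
```

For example, `TinyMobileViTv3Fusion(64, 96)(torch.randn(1, 64, 32, 32))` returns a tensor with the same shape as its input, mirroring how the real block is dropped into the backbone without changing the feature-map size.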