diff --git a/mmdet/models/roi_heads/bbox_heads/bbox_head.py b/mmdet/models/roi_heads/bbox_heads/bbox_head.py
index ac98f3c3e03..3b2e8aae083 100644
--- a/mmdet/models/roi_heads/bbox_heads/bbox_head.py
+++ b/mmdet/models/roi_heads/bbox_heads/bbox_head.py
@@ -87,8 +87,9 @@ def __init__(self,
out_dim_reg = box_dim if reg_class_agnostic else \
box_dim * num_classes
reg_predictor_cfg_ = self.reg_predictor_cfg.copy()
- reg_predictor_cfg_.update(
- in_features=in_channels, out_features=out_dim_reg)
+ if isinstance(reg_predictor_cfg_, (dict, ConfigDict)):
+ reg_predictor_cfg_.update(
+ in_features=in_channels, out_features=out_dim_reg)
self.fc_reg = MODELS.build(reg_predictor_cfg_)
self.debug_imgs = None
if init_cfg is None:
diff --git a/mmdet/models/roi_heads/bbox_heads/convfc_bbox_head.py b/mmdet/models/roi_heads/bbox_heads/convfc_bbox_head.py
index 28d76c70c45..cb6aadd86d3 100644
--- a/mmdet/models/roi_heads/bbox_heads/convfc_bbox_head.py
+++ b/mmdet/models/roi_heads/bbox_heads/convfc_bbox_head.py
@@ -95,8 +95,9 @@ def __init__(self,
out_dim_reg = box_dim if self.reg_class_agnostic else \
box_dim * self.num_classes
reg_predictor_cfg_ = self.reg_predictor_cfg.copy()
- reg_predictor_cfg_.update(
- in_features=self.reg_last_dim, out_features=out_dim_reg)
+ if isinstance(reg_predictor_cfg_, (dict, ConfigDict)):
+ reg_predictor_cfg_.update(
+ in_features=self.reg_last_dim, out_features=out_dim_reg)
self.fc_reg = MODELS.build(reg_predictor_cfg_)
if init_cfg is None:
diff --git a/projects/Detic/README.md b/projects/Detic/README.md
new file mode 100644
index 00000000000..4e99779342d
--- /dev/null
+++ b/projects/Detic/README.md
@@ -0,0 +1,154 @@
+# Detecting Twenty-thousand Classes using Image-level Supervision
+
+## Description
+
+**Detic**: A **Det**ector with **i**mage **c**lasses that can use image-level labels to easily train detectors.
+
+
+
+> [**Detecting Twenty-thousand Classes using Image-level Supervision**](http://arxiv.org/abs/2201.02605),
+> Xingyi Zhou, Rohit Girdhar, Armand Joulin, Philipp Krähenbühl, Ishan Misra,
+> *ECCV 2022 ([arXiv 2201.02605](http://arxiv.org/abs/2201.02605))*
+
+## Usage
+
+
+
+## Installation
+
+Detic requires CLIP to be installed:
+
+```shell
+pip install git+https://github.com/openai/CLIP.git
+```
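+
+To quickly verify the installation, you can run a short Python check (a minimal sketch; `ViT-B/32` is the CLIP model that Detic's text encoder loads by default, and `clip.load` downloads the weights on first use):
+
+```python
+import clip
+import torch
+
+# List the CLIP checkpoints shipped with the openai/CLIP package.
+print(clip.available_models())  # e.g. ['RN50', ..., 'ViT-B/32', ...]
+
+# Encode a toy prompt with the same backbone Detic uses for text embeddings.
+model, _ = clip.load('ViT-B/32', device='cpu')
+with torch.no_grad():
+    emb = model.encode_text(clip.tokenize(['a cat']))
+print(emb.shape)  # torch.Size([1, 512])
+```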
+
+### Demo
+
+#### Inference with existing dataset vocabulary embeddings
+
+First, go to the Detic project folder.
+
+```shell
+cd projects/Detic
+```
+
+Then, download the pre-computed CLIP embeddings from [dataset metainfo](https://github.com/facebookresearch/Detic/tree/main/datasets/metadata) to the `datasets/metadata` folder.
+The CLIP embeddings are loaded into the zero-shot classifier during inference.
+For example, you can download LVIS's class name embeddings with the following command:
+
+```shell
+wget -P datasets/metadata https://raw.githubusercontent.com/facebookresearch/Detic/main/datasets/metadata/lvis_v1_clip_a%2Bcname.npy
+```
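+
+The `.npy` file stores one CLIP text embedding per class name. You can inspect it with a short Python snippet (a rough sketch; the exact shape depends on the file, e.g. roughly `(1203, 512)` for the 1203 LVIS v1 classes with 512-dim ViT-B/32 text features):
+
+```python
+import numpy as np
+
+# Note: rename the file if wget saved it with '%2B' instead of '+'.
+emb = np.load('datasets/metadata/lvis_v1_clip_a+cname.npy')
+# Rows are classes, columns are the CLIP embedding dimension.
+print(emb.shape, emb.dtype)
+```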
+
+You can then run the demo as follows:
+
+```shell
+python demo.py \
+ ${IMAGE_PATH} \
+ ${CONFIG_PATH} \
+ ${MODEL_PATH} \
+ --show \
+ --score-thr 0.5 \
+ --dataset lvis
+```
+
+![image](https://user-images.githubusercontent.com/12907710/213624759-f0a2ba0c-0f5c-4424-a350-5ba5349e5842.png)
+
+#### Inference with custom vocabularies
+
+Detic can detect any class given only its name, because it uses CLIP text embeddings of the class names as classifier weights.
+
+You can pass custom classes with the `--class-name` option:
+
+```shell
+python demo.py \
+ ${IMAGE_PATH} \
+ ${CONFIG_PATH} \
+ ${MODEL_PATH} \
+ --show \
+ --score-thr 0.3 \
+ --class-name headphone webcam paper coffe
+```
+
+![image](https://user-images.githubusercontent.com/12907710/213624637-e9e8a313-9821-4782-a18a-4408c876852b.png)
+
+Note that `headphone`, `paper` and `coffe` (typo intended) are not LVIS classes. Despite the misspelled class name, Detic can produce a reasonable detection for `coffe`.
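+
+Under the hood, `demo.py` computes CLIP text embeddings for the given names and swaps them into the zero-shot classifier of every cascade stage. The same can be done programmatically with the helpers shipped in this project (a rough sketch mirroring `demo.py`, run from the `projects/Detic` folder; the checkpoint path and test image are placeholders):
+
+```python
+from mmdet.apis import inference_detector, init_detector
+from mmdet.utils import register_all_modules
+
+from detic.utils import get_text_embeddings, reset_cls_layer_weight
+
+register_all_modules()
+model = init_detector(
+    'configs/detic_centernet2_swin-b_fpn_4x_lvis-coco-in21k.py',
+    'path/to/checkpoint.pth',  # placeholder checkpoint path
+    device='cuda:0')
+
+# Compute CLIP embeddings for a custom vocabulary and replace the
+# classifier weights of all cascade stages with them.
+classes = ['headphone', 'webcam', 'paper']
+embeddings = get_text_embeddings(custom_vocabulary=classes)
+reset_cls_layer_weight(model, embeddings)
+model.dataset_meta['classes'] = classes
+
+result = inference_detector(model, 'demo.jpg')  # placeholder image path
+```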
+
+## Results
+
+Here we only provide the Detic Swin-B model for the open vocabulary demo. Multi-dataset training and open-vocabulary testing will be supported in the future.
+
+To find more variants, please visit the [official model zoo](https://github.com/facebookresearch/Detic/blob/main/docs/MODEL_ZOO.md).
+
+| Backbone | Training data | Config | Download |
+| :------: | :------------------------: | :-------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| Swin-B | ImageNet-21K & LVIS & COCO | [config](./configs/detic_centernet2_swin-b_fpn_4x_lvis-coco-in21k.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/detic/detic_centernet2_swin-b_fpn_4x_lvis-coco-in21k/detic_centernet2_swin-b_fpn_4x_lvis-coco-in21k_20230120-0d301978.pth) |
+
+## Citation
+
+If you find Detic useful in your research or applications, please consider giving a star 🌟 to the [official repository](https://github.com/facebookresearch/Detic) and citing Detic by the following BibTeX entry.
+
+```BibTeX
+@inproceedings{zhou2022detecting,
+ title={Detecting Twenty-thousand Classes using Image-level Supervision},
+ author={Zhou, Xingyi and Girdhar, Rohit and Joulin, Armand and Kr{\"a}henb{\"u}hl, Philipp and Misra, Ishan},
+ booktitle={ECCV},
+ year={2022}
+}
+```
+
+## Checklist
+
+
+
+- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`.
+
+ - [x] Finish the code
+
+
+
+ - [x] Basic docstrings & proper citation
+
+
+
+ - [x] Test-time correctness
+
+
+
+ - [x] A full README
+
+
+
+- [ ] Milestone 2: Indicates a successful model implementation.
+
+ - [ ] Training-time correctness
+
+
+
+- [ ] Milestone 3: Good to be a part of our core package!
+
+ - [ ] Type hints and docstrings
+
+
+
+ - [ ] Unit tests
+
+
+
+ - [ ] Code polishing
+
+
+
+ - [ ] Metafile.yml
+
+
+
+- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure.
diff --git a/projects/Detic/configs/detic_centernet2_swin-b_fpn_4x_lvis-coco-in21k.py b/projects/Detic/configs/detic_centernet2_swin-b_fpn_4x_lvis-coco-in21k.py
new file mode 100644
index 00000000000..19a17aea7bc
--- /dev/null
+++ b/projects/Detic/configs/detic_centernet2_swin-b_fpn_4x_lvis-coco-in21k.py
@@ -0,0 +1,298 @@
+_base_ = 'mmdet::common/lsj-200e_coco-detection.py'
+
+custom_imports = dict(
+ imports=['projects.Detic.detic'], allow_failed_imports=False)
+
+image_size = (1024, 1024)
+batch_augments = [dict(type='BatchFixedSizePad', size=image_size)]
+
+cls_layer = dict(
+ type='ZeroShotClassifier',
+ zs_weight_path='rand',
+ zs_weight_dim=512,
+ use_bias=0.0,
+ norm_weight=True,
+ norm_temperature=50.0)
+reg_layer = [
+ dict(type='Linear', in_features=1024, out_features=1024),
+ dict(type='ReLU', inplace=True),
+ dict(type='Linear', in_features=1024, out_features=4)
+]
+
+num_classes = 22047
+
+model = dict(
+ type='CascadeRCNN',
+ data_preprocessor=dict(
+ type='DetDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True,
+ pad_size_divisor=32,
+ batch_augments=batch_augments),
+ backbone=dict(
+ type='SwinTransformer',
+ embed_dims=128,
+ depths=[2, 2, 18, 2],
+ num_heads=[4, 8, 16, 32],
+ window_size=7,
+ mlp_ratio=4,
+ qkv_bias=True,
+ qk_scale=None,
+ drop_rate=0.,
+ attn_drop_rate=0.,
+ drop_path_rate=0.3,
+ patch_norm=True,
+ out_indices=(1, 2, 3),
+ with_cp=False),
+ neck=dict(
+ type='FPN',
+ in_channels=[256, 512, 1024],
+ out_channels=256,
+ start_level=0,
+ add_extra_convs='on_output',
+ num_outs=5,
+ init_cfg=dict(type='Caffe2Xavier', layer='Conv2d'),
+ relu_before_extra_convs=True),
+ rpn_head=dict(
+ type='CenterNetRPNHead',
+ num_classes=1,
+ in_channels=256,
+ stacked_convs=4,
+ feat_channels=256,
+ strides=[8, 16, 32, 64, 128],
+ conv_bias=True,
+ norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
+ loss_cls=dict(
+ type='GaussianFocalLoss',
+ pos_weight=0.25,
+ neg_weight=0.75,
+ loss_weight=1.0),
+ loss_bbox=dict(type='GIoULoss', loss_weight=2.0),
+ ),
+ roi_head=dict(
+ type='DeticRoIHead',
+ num_stages=3,
+ stage_loss_weights=[1, 0.5, 0.25],
+ bbox_roi_extractor=dict(
+ type='SingleRoIExtractor',
+ roi_layer=dict(
+ type='RoIAlign',
+ output_size=7,
+ sampling_ratio=0,
+ use_torchvision=True),
+ out_channels=256,
+ featmap_strides=[8, 16, 32],
+ # approximately equal to
+ # canonical_box_size=224, canonical_level=4 in D2
+ finest_scale=112),
+ bbox_head=[
+ dict(
+ type='DeticBBoxHead',
+ in_channels=256,
+ fc_out_channels=1024,
+ roi_feat_size=7,
+ num_classes=num_classes,
+ cls_predictor_cfg=cls_layer,
+ reg_predictor_cfg=reg_layer,
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[0., 0., 0., 0.],
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
+ reg_class_agnostic=True,
+ loss_cls=dict(
+ type='CrossEntropyLoss', use_sigmoid=True,
+ loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+ loss_weight=1.0)),
+ dict(
+ type='DeticBBoxHead',
+ in_channels=256,
+ fc_out_channels=1024,
+ roi_feat_size=7,
+ num_classes=num_classes,
+ cls_predictor_cfg=cls_layer,
+ reg_predictor_cfg=reg_layer,
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[0., 0., 0., 0.],
+ target_stds=[0.05, 0.05, 0.1, 0.1]),
+ reg_class_agnostic=True,
+ loss_cls=dict(
+ type='CrossEntropyLoss', use_sigmoid=True,
+ loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+ loss_weight=1.0)),
+ dict(
+ type='DeticBBoxHead',
+ in_channels=256,
+ fc_out_channels=1024,
+ roi_feat_size=7,
+ num_classes=num_classes,
+ cls_predictor_cfg=cls_layer,
+ reg_predictor_cfg=reg_layer,
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[0., 0., 0., 0.],
+ target_stds=[0.033, 0.033, 0.067, 0.067]),
+ reg_class_agnostic=True,
+ loss_cls=dict(
+ type='CrossEntropyLoss', use_sigmoid=True,
+ loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
+ ],
+ mask_roi_extractor=dict(
+ type='SingleRoIExtractor',
+ roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
+ out_channels=256,
+ featmap_strides=[8, 16, 32],
+ # approximately equal to
+ # canonical_box_size=224, canonical_level=4 in D2
+ finest_scale=112),
+ mask_head=dict(
+ type='FCNMaskHead',
+ num_convs=4,
+ in_channels=256,
+ conv_out_channels=256,
+ class_agnostic=True,
+ num_classes=num_classes,
+ loss_mask=dict(
+ type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
+ # model training and testing settings
+ train_cfg=dict(
+ rpn=dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.7,
+ neg_iou_thr=0.3,
+ min_pos_iou=0.3,
+ match_low_quality=True,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=256,
+ pos_fraction=0.5,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=False),
+ allowed_border=0,
+ pos_weight=-1,
+ debug=False),
+ rpn_proposal=dict(
+ nms_pre=2000,
+ max_per_img=2000,
+ nms=dict(type='nms', iou_threshold=0.7),
+ min_bbox_size=0),
+ rcnn=[
+ dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.6,
+ neg_iou_thr=0.6,
+ min_pos_iou=0.6,
+ match_low_quality=False,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=512,
+ pos_fraction=0.25,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ mask_size=28,
+ pos_weight=-1,
+ debug=False),
+ dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.7,
+ neg_iou_thr=0.7,
+ min_pos_iou=0.7,
+ match_low_quality=False,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=512,
+ pos_fraction=0.25,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ mask_size=28,
+ pos_weight=-1,
+ debug=False),
+ dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.8,
+ neg_iou_thr=0.8,
+ min_pos_iou=0.8,
+ match_low_quality=False,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=512,
+ pos_fraction=0.25,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ mask_size=28,
+ pos_weight=-1,
+ debug=False)
+ ]),
+ test_cfg=dict(
+ rpn=dict(
+ score_thr=0.0001,
+ nms_pre=1000,
+ max_per_img=256,
+ nms=dict(type='nms', iou_threshold=0.9),
+ min_bbox_size=0),
+ rcnn=dict(
+ score_thr=0.02,
+ nms=dict(type='nms', iou_threshold=0.5),
+ max_per_img=300,
+ mask_thr_binary=0.5)))
+
+backend = 'pillow'
+test_pipeline = [
+ dict(
+ type='LoadImageFromFile',
+ file_client_args=_base_.file_client_args,
+ imdecode_backend=backend),
+ dict(type='Resize', scale=(1333, 800), keep_ratio=True, backend=backend),
+ dict(
+ type='LoadAnnotations',
+ with_bbox=True,
+ with_mask=True,
+ poly2mask=False),
+ dict(
+ type='PackDetInputs',
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+ 'scale_factor'))
+]
+
+train_dataloader = dict(batch_size=8, num_workers=4)
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = val_dataloader
+# Enable automatic-mixed-precision training with AmpOptimWrapper.
+optim_wrapper = dict(
+ type='AmpOptimWrapper',
+ optimizer=dict(
+ type='SGD', lr=0.01 * 4, momentum=0.9, weight_decay=0.00004),
+ paramwise_cfg=dict(norm_decay_mult=0.))
+
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=0.00025,
+ by_epoch=False,
+ begin=0,
+ end=4000),
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=25,
+ by_epoch=True,
+ milestones=[22, 24],
+ gamma=0.1)
+]
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/projects/Detic/demo.py b/projects/Detic/demo.py
new file mode 100644
index 00000000000..d5c80c9aa5f
--- /dev/null
+++ b/projects/Detic/demo.py
@@ -0,0 +1,142 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+import urllib
+from argparse import ArgumentParser
+
+import mmcv
+import torch
+from mmengine.logging import print_log
+from mmengine.utils import ProgressBar, scandir
+
+from mmdet.apis import inference_detector, init_detector
+from mmdet.registry import VISUALIZERS
+from mmdet.utils import register_all_modules
+
+IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif',
+ '.tiff', '.webp')
+
+
+def get_file_list(source_root: str) -> tuple:
+ """Get file list.
+
+ Args:
+ source_root (str): image or video source path
+
+    Returns:
+        source_file_path_list (list): A list of all source files.
+        source_type (dict): Source type, one of file, url or dir.
+ """
+ is_dir = os.path.isdir(source_root)
+ is_url = source_root.startswith(('http:/', 'https:/'))
+ is_file = os.path.splitext(source_root)[-1].lower() in IMG_EXTENSIONS
+
+ source_file_path_list = []
+ if is_dir:
+ # when input source is dir
+ for file in scandir(source_root, IMG_EXTENSIONS, recursive=True):
+ source_file_path_list.append(os.path.join(source_root, file))
+ elif is_url:
+ # when input source is url
+ filename = os.path.basename(
+ urllib.parse.unquote(source_root).split('?')[0])
+ file_save_path = os.path.join(os.getcwd(), filename)
+ print(f'Downloading source file to {file_save_path}')
+ torch.hub.download_url_to_file(source_root, file_save_path)
+ source_file_path_list = [file_save_path]
+ elif is_file:
+ # when input source is single image
+ source_file_path_list = [source_root]
+ else:
+ print('Cannot find image file.')
+
+ source_type = dict(is_dir=is_dir, is_url=is_url, is_file=is_file)
+
+ return source_file_path_list, source_type
+
+
+def parse_args():
+ parser = ArgumentParser()
+ parser.add_argument(
+        'img', help='Image path: an image file, a directory or a URL.')
+ parser.add_argument('config', help='Config file')
+ parser.add_argument('checkpoint', help='Checkpoint file')
+ parser.add_argument(
+        '--out-dir', default='./output', help='Path to the output directory')
+ parser.add_argument(
+ '--device', default='cuda:0', help='Device used for inference')
+ parser.add_argument(
+ '--show', action='store_true', help='Show the detection results')
+ parser.add_argument(
+ '--score-thr', type=float, default=0.3, help='Bbox score threshold')
+ parser.add_argument(
+ '--dataset', type=str, help='dataset name to load the text embedding')
+ parser.add_argument(
+ '--class-name', nargs='+', type=str, help='custom class names')
+ args = parser.parse_args()
+ return args
+
+
+def main():
+ args = parse_args()
+
+ # register all modules in mmdet into the registries
+ register_all_modules()
+
+ # build the model from a config file and a checkpoint file
+ model = init_detector(args.config, args.checkpoint, device=args.device)
+
+ if not os.path.exists(args.out_dir) and not args.show:
+ os.mkdir(args.out_dir)
+
+ # init visualizer
+ visualizer = VISUALIZERS.build(model.cfg.visualizer)
+ visualizer.dataset_meta = model.dataset_meta
+
+ # get file list
+ files, source_type = get_file_list(args.img)
+ from detic.utils import (get_class_names, get_text_embeddings,
+ reset_cls_layer_weight)
+
+ # class name embeddings
+ if args.class_name:
+ dataset_classes = args.class_name
+ elif args.dataset:
+ dataset_classes = get_class_names(args.dataset)
+ embedding = get_text_embeddings(
+ dataset=args.dataset, custom_vocabulary=args.class_name)
+ visualizer.dataset_meta['classes'] = dataset_classes
+ reset_cls_layer_weight(model, embedding)
+
+ # start detector inference
+ progress_bar = ProgressBar(len(files))
+ for file in files:
+ result = inference_detector(model, file)
+
+ img = mmcv.imread(file)
+ img = mmcv.imconvert(img, 'bgr', 'rgb')
+
+ if source_type['is_dir']:
+ filename = os.path.relpath(file, args.img).replace('/', '_')
+ else:
+ filename = os.path.basename(file)
+ out_file = None if args.show else os.path.join(args.out_dir, filename)
+
+ progress_bar.update()
+
+ visualizer.add_datasample(
+ filename,
+ img,
+ data_sample=result,
+ draw_gt=False,
+ show=args.show,
+ wait_time=0,
+ out_file=out_file,
+ pred_score_thr=args.score_thr)
+
+ if not args.show:
+ print_log(
+ f'\nResults have been saved at {os.path.abspath(args.out_dir)}')
+
+
+if __name__ == '__main__':
+ main()
diff --git a/projects/Detic/detic/__init__.py b/projects/Detic/detic/__init__.py
new file mode 100644
index 00000000000..d0ad070259a
--- /dev/null
+++ b/projects/Detic/detic/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .centernet_rpn_head import CenterNetRPNHead
+from .detic_bbox_head import DeticBBoxHead
+from .detic_roi_head import DeticRoIHead
+from .zero_shot_classifier import ZeroShotClassifier
+
+__all__ = [
+ 'CenterNetRPNHead', 'DeticBBoxHead', 'DeticRoIHead', 'ZeroShotClassifier'
+]
diff --git a/projects/Detic/detic/centernet_rpn_head.py b/projects/Detic/detic/centernet_rpn_head.py
new file mode 100644
index 00000000000..765d6dfb2b6
--- /dev/null
+++ b/projects/Detic/detic/centernet_rpn_head.py
@@ -0,0 +1,196 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+from typing import List, Sequence, Tuple
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import Scale
+from mmengine import ConfigDict
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.models.dense_heads import CenterNetUpdateHead
+from mmdet.models.utils import multi_apply
+from mmdet.registry import MODELS
+
+INF = 1000000000
+RangeType = Sequence[Tuple[int, int]]
+
+
+@MODELS.register_module(force=True) # avoid bug
+class CenterNetRPNHead(CenterNetUpdateHead):
+ """CenterNetUpdateHead is an improved version of CenterNet in CenterNet2.
+
+ Paper link ``_.
+ """
+
+ def _init_layers(self) -> None:
+ """Initialize layers of the head."""
+ self._init_reg_convs()
+ self._init_predictor()
+
+ def _init_predictor(self) -> None:
+ """Initialize predictor layers of the head."""
+ self.conv_cls = nn.Conv2d(
+ self.feat_channels, self.num_classes, 3, padding=1)
+ self.conv_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1)
+
+ def forward(self, x: Tuple[Tensor]) -> Tuple[List[Tensor], List[Tensor]]:
+ """Forward features from the upstream network.
+
+ Args:
+ x (tuple[Tensor]): Features from the upstream network, each is
+ a 4D-tensor.
+
+ Returns:
+ tuple: A tuple of each level outputs.
+
+ - cls_scores (list[Tensor]): Box scores for each scale level, \
+ each is a 4D-tensor, the channel number is num_classes.
+ - bbox_preds (list[Tensor]): Box energies / deltas for each \
+ scale level, each is a 4D-tensor, the channel number is 4.
+ """
+ res = multi_apply(self.forward_single, x, self.scales, self.strides)
+ return res
+
+ def forward_single(self, x: Tensor, scale: Scale,
+ stride: int) -> Tuple[Tensor, Tensor]:
+ """Forward features of a single scale level.
+
+ Args:
+ x (Tensor): FPN feature maps of the specified stride.
+ scale (:obj:`mmcv.cnn.Scale`): Learnable scale module to resize
+ the bbox prediction.
+ stride (int): The corresponding stride for feature maps.
+
+ Returns:
+ tuple: scores for each class, bbox predictions of
+ input feature maps.
+ """
+ for m in self.reg_convs:
+ x = m(x)
+ cls_score = self.conv_cls(x)
+ bbox_pred = self.conv_reg(x)
+ # scale the bbox_pred of different level
+ # float to avoid overflow when enabling FP16
+ bbox_pred = scale(bbox_pred).float()
+ # bbox_pred needed for gradient computation has been modified
+ # by F.relu(bbox_pred) when run with PyTorch 1.10. So replace
+ # F.relu(bbox_pred) with bbox_pred.clamp(min=0)
+ bbox_pred = bbox_pred.clamp(min=0)
+ if not self.training:
+ bbox_pred *= stride
+ return cls_score, bbox_pred # score aligned, box larger
+
+ def _predict_by_feat_single(self,
+ cls_score_list: List[Tensor],
+ bbox_pred_list: List[Tensor],
+ score_factor_list: List[Tensor],
+ mlvl_priors: List[Tensor],
+ img_meta: dict,
+ cfg: ConfigDict,
+ rescale: bool = False,
+ with_nms: bool = True) -> InstanceData:
+ """Transform a single image's features extracted from the head into
+ bbox results.
+
+ Args:
+ cls_score_list (list[Tensor]): Box scores from all scale
+ levels of a single image, each item has shape
+ (num_priors * num_classes, H, W).
+ bbox_pred_list (list[Tensor]): Box energies / deltas from
+ all scale levels of a single image, each item has shape
+ (num_priors * 4, H, W).
+ score_factor_list (list[Tensor]): Score factor from all scale
+ levels of a single image, each item has shape
+ (num_priors * 1, H, W).
+ mlvl_priors (list[Tensor]): Each element in the list is
+ the priors of a single level in feature pyramid. In all
+ anchor-based methods, it has shape (num_priors, 4). In
+ all anchor-free methods, it has shape (num_priors, 2)
+ when `with_stride=True`, otherwise it still has shape
+ (num_priors, 4).
+ img_meta (dict): Image meta info.
+ cfg (mmengine.Config): Test / postprocessing configuration,
+ if None, test_cfg would be used.
+ rescale (bool): If True, return boxes in original image space.
+ Defaults to False.
+ with_nms (bool): If True, do nms before return boxes.
+ Defaults to True.
+
+ Returns:
+ :obj:`InstanceData`: Detection results of each image
+ after the post process.
+ Each item usually contains following keys.
+
+ - scores (Tensor): Classification scores, has a shape
+ (num_instance, )
+ - labels (Tensor): Labels of bboxes, has a shape
+ (num_instances, ).
+ - bboxes (Tensor): Has a shape (num_instances, 4),
+ the last dimension 4 arrange as (x1, y1, x2, y2).
+ """
+
+ cfg = self.test_cfg if cfg is None else cfg
+ cfg = copy.deepcopy(cfg)
+ nms_pre = cfg.get('nms_pre', -1)
+
+ mlvl_bbox_preds = []
+ mlvl_valid_priors = []
+ mlvl_scores = []
+ mlvl_labels = []
+
+ for level_idx, (cls_score, bbox_pred, score_factor, priors) in \
+ enumerate(zip(cls_score_list, bbox_pred_list,
+ score_factor_list, mlvl_priors)):
+
+ assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
+
+ dim = self.bbox_coder.encode_size
+ bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, dim)
+ cls_score = cls_score.permute(1, 2,
+ 0).reshape(-1, self.cls_out_channels)
+ heatmap = cls_score.sigmoid()
+ score_thr = cfg.get('score_thr', 0)
+
+ candidate_inds = heatmap > score_thr # 0.05
+ pre_nms_top_n = candidate_inds.sum() # N
+ pre_nms_top_n = pre_nms_top_n.clamp(max=nms_pre) # N
+
+ heatmap = heatmap[candidate_inds] # n
+
+ candidate_nonzeros = candidate_inds.nonzero() # n
+ box_loc = candidate_nonzeros[:, 0] # n
+ labels = candidate_nonzeros[:, 1] # n
+
+ bbox_pred = bbox_pred[box_loc] # n x 4
+ per_grids = priors[box_loc] # n x 2
+
+ if candidate_inds.sum().item() > pre_nms_top_n.item():
+ heatmap, top_k_indices = \
+ heatmap.topk(pre_nms_top_n, sorted=False)
+ labels = labels[top_k_indices]
+ bbox_pred = bbox_pred[top_k_indices]
+ per_grids = per_grids[top_k_indices]
+
+ bboxes = self.bbox_coder.decode(per_grids, bbox_pred)
+ # avoid invalid boxes in RoI heads
+ bboxes[:, 2] = torch.max(bboxes[:, 2], bboxes[:, 0] + 0.01)
+ bboxes[:, 3] = torch.max(bboxes[:, 3], bboxes[:, 1] + 0.01)
+
+ mlvl_bbox_preds.append(bboxes)
+ mlvl_valid_priors.append(priors)
+ mlvl_scores.append(torch.sqrt(heatmap))
+ mlvl_labels.append(labels)
+
+ results = InstanceData()
+ results.bboxes = torch.cat(mlvl_bbox_preds)
+ results.scores = torch.cat(mlvl_scores)
+ results.labels = torch.cat(mlvl_labels)
+
+ return self._bbox_post_process(
+ results=results,
+ cfg=cfg,
+ rescale=rescale,
+ with_nms=with_nms,
+ img_meta=img_meta)
diff --git a/projects/Detic/detic/detic_bbox_head.py b/projects/Detic/detic/detic_bbox_head.py
new file mode 100644
index 00000000000..9408cbe04fd
--- /dev/null
+++ b/projects/Detic/detic/detic_bbox_head.py
@@ -0,0 +1,112 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Union
+
+from mmengine.config import ConfigDict
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.models.layers import multiclass_nms
+from mmdet.models.roi_heads.bbox_heads import Shared2FCBBoxHead
+from mmdet.models.utils import empty_instances
+from mmdet.registry import MODELS
+from mmdet.structures.bbox import get_box_tensor, scale_boxes
+
+
+@MODELS.register_module(force=True) # avoid bug
+class DeticBBoxHead(Shared2FCBBoxHead):
+
+ def __init__(self,
+ *args,
+ init_cfg: Optional[Union[dict, ConfigDict]] = None,
+ **kwargs) -> None:
+ super().__init__(*args, init_cfg=init_cfg, **kwargs)
+ # reconstruct fc_cls and fc_reg since input channels are changed
+ assert self.with_cls
+ cls_channels = self.num_classes
+ cls_predictor_cfg_ = self.cls_predictor_cfg.copy()
+ cls_predictor_cfg_.update(
+ in_features=self.cls_last_dim, out_features=cls_channels)
+ self.fc_cls = MODELS.build(cls_predictor_cfg_)
+
+ def _predict_by_feat_single(
+ self,
+ roi: Tensor,
+ cls_score: Tensor,
+ bbox_pred: Tensor,
+ img_meta: dict,
+ rescale: bool = False,
+ rcnn_test_cfg: Optional[ConfigDict] = None) -> InstanceData:
+ """Transform a single image's features extracted from the head into
+ bbox results.
+
+ Args:
+ roi (Tensor): Boxes to be transformed. Has shape (num_boxes, 5).
+ last dimension 5 arrange as (batch_index, x1, y1, x2, y2).
+ cls_score (Tensor): Box scores, has shape
+ (num_boxes, num_classes + 1).
+ bbox_pred (Tensor): Box energies / deltas.
+ has shape (num_boxes, num_classes * 4).
+ img_meta (dict): image information.
+ rescale (bool): If True, return boxes in original image space.
+ Defaults to False.
+ rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of Bbox Head.
+ Defaults to None
+
+ Returns:
+ :obj:`InstanceData`: Detection results of each image\
+ Each item usually contains following keys.
+
+ - scores (Tensor): Classification scores, has a shape
+ (num_instance, )
+ - labels (Tensor): Labels of bboxes, has a shape
+ (num_instances, ).
+ - bboxes (Tensor): Has a shape (num_instances, 4),
+ the last dimension 4 arrange as (x1, y1, x2, y2).
+ """
+ results = InstanceData()
+ if roi.shape[0] == 0:
+ return empty_instances([img_meta],
+ roi.device,
+ task_type='bbox',
+ instance_results=[results],
+ box_type=self.predict_box_type,
+ use_box_type=False,
+ num_classes=self.num_classes,
+ score_per_cls=rcnn_test_cfg is None)[0]
+ scores = cls_score
+ img_shape = img_meta['img_shape']
+ num_rois = roi.size(0)
+
+ num_classes = 1 if self.reg_class_agnostic else self.num_classes
+ roi = roi.repeat_interleave(num_classes, dim=0)
+ bbox_pred = bbox_pred.view(-1, self.bbox_coder.encode_size)
+ bboxes = self.bbox_coder.decode(
+ roi[..., 1:], bbox_pred, max_shape=img_shape)
+
+ if rescale and bboxes.size(0) > 0:
+ assert img_meta.get('scale_factor') is not None
+ scale_factor = [1 / s for s in img_meta['scale_factor']]
+ bboxes = scale_boxes(bboxes, scale_factor)
+
+ # Get the inside tensor when `bboxes` is a box type
+ bboxes = get_box_tensor(bboxes)
+ box_dim = bboxes.size(-1)
+ bboxes = bboxes.view(num_rois, -1)
+
+ if rcnn_test_cfg is None:
+ # This means that it is aug test.
+ # It needs to return the raw results without nms.
+ results.bboxes = bboxes
+ results.scores = scores
+ else:
+ det_bboxes, det_labels = multiclass_nms(
+ bboxes,
+ scores,
+ rcnn_test_cfg.score_thr,
+ rcnn_test_cfg.nms,
+ rcnn_test_cfg.max_per_img,
+ box_dim=box_dim)
+ results.bboxes = det_bboxes[:, :-1]
+ results.scores = det_bboxes[:, -1]
+ results.labels = det_labels
+ return results
diff --git a/projects/Detic/detic/detic_roi_head.py b/projects/Detic/detic/detic_roi_head.py
new file mode 100644
index 00000000000..a09c11c6e69
--- /dev/null
+++ b/projects/Detic/detic/detic_roi_head.py
@@ -0,0 +1,326 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Sequence, Tuple
+
+import torch
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.models.roi_heads import CascadeRoIHead
+from mmdet.models.task_modules.samplers import SamplingResult
+from mmdet.models.test_time_augs import merge_aug_masks
+from mmdet.models.utils.misc import empty_instances
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.structures.bbox import bbox2roi, get_box_tensor
+from mmdet.utils import ConfigType, InstanceList, MultiConfig
+
+
+@MODELS.register_module(force=True) # avoid bug
+class DeticRoIHead(CascadeRoIHead):
+
+ def init_mask_head(self, mask_roi_extractor: MultiConfig,
+ mask_head: MultiConfig) -> None:
+ """Initialize mask head and mask roi extractor.
+
+ Args:
+            mask_head (dict): Config of the mask head.
+            mask_roi_extractor (:obj:`ConfigDict`, dict or list):
+                Config of the mask RoI extractor.
+ """
+ self.mask_head = MODELS.build(mask_head)
+
+ if mask_roi_extractor is not None:
+ self.share_roi_extractor = False
+ self.mask_roi_extractor = MODELS.build(mask_roi_extractor)
+ else:
+ self.share_roi_extractor = True
+ self.mask_roi_extractor = self.bbox_roi_extractor
+
+ def _refine_roi(self, x: Tuple[Tensor], rois: Tensor,
+ batch_img_metas: List[dict],
+ num_proposals_per_img: Sequence[int], **kwargs) -> tuple:
+ """Multi-stage refinement of RoI.
+
+ Args:
+ x (tuple[Tensor]): List of multi-level img features.
+ rois (Tensor): shape (n, 5), [batch_ind, x1, y1, x2, y2]
+ batch_img_metas (list[dict]): List of image information.
+ num_proposals_per_img (sequence[int]): number of proposals
+ in each image.
+
+ Returns:
+ tuple:
+
+ - rois (Tensor): Refined RoI.
+ - cls_scores (list[Tensor]): Average predicted
+ cls score per image.
+            - bbox_preds (list[Tensor]): Bbox branch predictions
+              of the last stage, split per image.
+ """
+ # "ms" in variable names means multi-stage
+ ms_scores = []
+ for stage in range(self.num_stages):
+ bbox_results = self._bbox_forward(
+ stage=stage, x=x, rois=rois, **kwargs)
+
+ # split batch bbox prediction back to each image
+ cls_scores = bbox_results['cls_score'].sigmoid()
+ bbox_preds = bbox_results['bbox_pred']
+
+ rois = rois.split(num_proposals_per_img, 0)
+ cls_scores = cls_scores.split(num_proposals_per_img, 0)
+ ms_scores.append(cls_scores)
+ bbox_preds = bbox_preds.split(num_proposals_per_img, 0)
+
+ if stage < self.num_stages - 1:
+ bbox_head = self.bbox_head[stage]
+ refine_rois_list = []
+ for i in range(len(batch_img_metas)):
+ if rois[i].shape[0] > 0:
+ bbox_label = cls_scores[i][:, :-1].argmax(dim=1)
+ # Refactor `bbox_head.regress_by_class` to only accept
+ # box tensor without img_idx concatenated.
+ refined_bboxes = bbox_head.regress_by_class(
+ rois[i][:, 1:], bbox_label, bbox_preds[i],
+ batch_img_metas[i])
+ refined_bboxes = get_box_tensor(refined_bboxes)
+ refined_rois = torch.cat(
+ [rois[i][:, [0]], refined_bboxes], dim=1)
+ refine_rois_list.append(refined_rois)
+ rois = torch.cat(refine_rois_list)
+ # ms_scores aligned
+ # average scores of each image by stages
+ cls_scores = [
+ sum([score[i] for score in ms_scores]) / float(len(ms_scores))
+ for i in range(len(batch_img_metas))
+ ] # aligned
+ return rois, cls_scores, bbox_preds
+
+ def _bbox_forward(self, stage: int, x: Tuple[Tensor],
+ rois: Tensor) -> dict:
+ """Box head forward function used in both training and testing.
+
+ Args:
+ stage (int): The current stage in Cascade RoI Head.
+ x (tuple[Tensor]): List of multi-level img features.
+ rois (Tensor): RoIs with the shape (n, 5) where the first
+ column indicates batch id of each RoI.
+
+ Returns:
+ dict[str, Tensor]: Usually returns a dictionary with keys:
+
+ - `cls_score` (Tensor): Classification scores.
+ - `bbox_pred` (Tensor): Box energies / deltas.
+ - `bbox_feats` (Tensor): Extract bbox RoI features.
+ """
+ bbox_roi_extractor = self.bbox_roi_extractor[stage]
+ bbox_head = self.bbox_head[stage]
+ bbox_feats = bbox_roi_extractor(x[:bbox_roi_extractor.num_inputs],
+ rois)
+ # do not support caffe_c4 model anymore
+ cls_score, bbox_pred = bbox_head(bbox_feats)
+
+ bbox_results = dict(
+ cls_score=cls_score, bbox_pred=bbox_pred, bbox_feats=bbox_feats)
+ return bbox_results
+
+ def predict_bbox(self,
+ x: Tuple[Tensor],
+ batch_img_metas: List[dict],
+ rpn_results_list: InstanceList,
+ rcnn_test_cfg: ConfigType,
+ rescale: bool = False,
+ **kwargs) -> InstanceList:
+ """Perform forward propagation of the bbox head and predict detection
+ results on the features of the upstream network.
+
+ Args:
+ x (tuple[Tensor]): Feature maps of all scale level.
+ batch_img_metas (list[dict]): List of image information.
+ rpn_results_list (list[:obj:`InstanceData`]): List of region
+ proposals.
+ rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of R-CNN.
+ rescale (bool): If True, return boxes in original image space.
+ Defaults to False.
+
+ Returns:
+ list[:obj:`InstanceData`]: Detection results of each image
+ after the post process.
+ Each item usually contains following keys.
+
+ - scores (Tensor): Classification scores, has a shape
+ (num_instance, )
+ - labels (Tensor): Labels of bboxes, has a shape
+ (num_instances, ).
+ - bboxes (Tensor): Has a shape (num_instances, 4),
+ the last dimension 4 arrange as (x1, y1, x2, y2).
+ """
+ proposals = [res.bboxes for res in rpn_results_list]
+ proposal_scores = [res.scores for res in rpn_results_list]
+ num_proposals_per_img = tuple(len(p) for p in proposals)
+ rois = bbox2roi(proposals)
+
+ if rois.shape[0] == 0:
+ return empty_instances(
+ batch_img_metas,
+ rois.device,
+ task_type='bbox',
+ box_type=self.bbox_head[-1].predict_box_type,
+ num_classes=self.bbox_head[-1].num_classes,
+ score_per_cls=rcnn_test_cfg is None)
+ # rois aligned
+ rois, cls_scores, bbox_preds = self._refine_roi(
+ x=x,
+ rois=rois,
+ batch_img_metas=batch_img_metas,
+ num_proposals_per_img=num_proposals_per_img,
+ **kwargs)
+
+ # score reweighting in centernet2
+ cls_scores = [(s * ps[:, None])**0.5
+ for s, ps in zip(cls_scores, proposal_scores)]
+ cls_scores = [
+ s * (s == s[:, :-1].max(dim=1)[0][:, None]).float()
+ for s in cls_scores
+ ]
+
+ # fast_rcnn_inference
+ results_list = self.bbox_head[-1].predict_by_feat(
+ rois=rois,
+ cls_scores=cls_scores,
+ bbox_preds=bbox_preds,
+ batch_img_metas=batch_img_metas,
+ rescale=rescale,
+ rcnn_test_cfg=rcnn_test_cfg)
+ return results_list
+
+ def _mask_forward(self, x: Tuple[Tensor], rois: Tensor) -> dict:
+ """Mask head forward function used in both training and testing.
+
+ Args:
+ x (tuple[Tensor]): Tuple of multi-level img features.
+ rois (Tensor): RoIs with the shape (n, 5) where the first
+ column indicates batch id of each RoI.
+
+ Returns:
+ dict: Usually returns a dictionary with keys:
+
+ - `mask_preds` (Tensor): Mask prediction.
+ """
+ mask_feats = self.mask_roi_extractor(
+ x[:self.mask_roi_extractor.num_inputs], rois)
+ # do not support caffe_c4 model anymore
+ mask_preds = self.mask_head(mask_feats)
+
+ mask_results = dict(mask_preds=mask_preds)
+ return mask_results
+
+ def mask_loss(self, x, sampling_results: List[SamplingResult],
+ batch_gt_instances: InstanceList) -> dict:
+ """Run forward function and calculate loss for mask head in training.
+
+ Args:
+ x (tuple[Tensor]): Tuple of multi-level img features.
+            sampling_results (list[:obj:`SamplingResult`]): Sampling results.
+ batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+ gt_instance. It usually includes ``bboxes``, ``labels``, and
+ ``masks`` attributes.
+
+ Returns:
+ dict: Usually returns a dictionary with keys:
+
+ - `mask_preds` (Tensor): Mask prediction.
+ - `loss_mask` (dict): A dictionary of mask loss components.
+ """
+ pos_rois = bbox2roi([res.pos_priors for res in sampling_results])
+ mask_results = self._mask_forward(x, pos_rois)
+
+ mask_loss_and_target = self.mask_head.loss_and_target(
+ mask_preds=mask_results['mask_preds'],
+ sampling_results=sampling_results,
+ batch_gt_instances=batch_gt_instances,
+ rcnn_train_cfg=self.train_cfg[-1])
+ mask_results.update(mask_loss_and_target)
+
+ return mask_results
+
+ def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList,
+ batch_data_samples: SampleList) -> dict:
+ """Perform forward propagation and loss calculation of the detection
+ roi on the features of the upstream network.
+
+ Args:
+ x (tuple[Tensor]): List of multi-level img features.
+ rpn_results_list (list[:obj:`InstanceData`]): List of region
+ proposals.
+ batch_data_samples (list[:obj:`DetDataSample`]): The batch
+ data samples. It usually includes information such
+ as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+
+ Returns:
+ dict[str, Tensor]: A dictionary of loss components
+ """
+ raise NotImplementedError
+
+ def predict_mask(self,
+ x: Tuple[Tensor],
+ batch_img_metas: List[dict],
+ results_list: List[InstanceData],
+ rescale: bool = False) -> List[InstanceData]:
+ """Perform forward propagation of the mask head and predict detection
+ results on the features of the upstream network.
+
+ Args:
+ x (tuple[Tensor]): Feature maps of all scale level.
+ batch_img_metas (list[dict]): List of image information.
+ results_list (list[:obj:`InstanceData`]): Detection results of
+ each image.
+ rescale (bool): If True, return boxes in original image space.
+ Defaults to False.
+
+ Returns:
+ list[:obj:`InstanceData`]: Detection results of each image
+ after the post process.
+ Each item usually contains following keys.
+
+ - scores (Tensor): Classification scores, has a shape
+ (num_instance, )
+ - labels (Tensor): Labels of bboxes, has a shape
+ (num_instances, ).
+ - bboxes (Tensor): Has a shape (num_instances, 4),
+ the last dimension 4 arrange as (x1, y1, x2, y2).
+ - masks (Tensor): Has a shape (num_instances, H, W).
+ """
+ bboxes = [res.bboxes for res in results_list]
+ mask_rois = bbox2roi(bboxes)
+ if mask_rois.shape[0] == 0:
+ results_list = empty_instances(
+ batch_img_metas,
+ mask_rois.device,
+ task_type='mask',
+ instance_results=results_list,
+ mask_thr_binary=self.test_cfg.mask_thr_binary)
+ return results_list
+
+ num_mask_rois_per_img = [len(res) for res in results_list]
+ aug_masks = []
+ mask_results = self._mask_forward(x, mask_rois)
+ mask_preds = mask_results['mask_preds']
+ # split batch mask prediction back to each image
+ mask_preds = mask_preds.split(num_mask_rois_per_img, 0)
+ aug_masks.append([m.sigmoid().detach() for m in mask_preds])
+
+ merged_masks = []
+ for i in range(len(batch_img_metas)):
+ aug_mask = [mask[i] for mask in aug_masks]
+ merged_mask = merge_aug_masks(aug_mask, batch_img_metas[i])
+ merged_masks.append(merged_mask)
+ results_list = self.mask_head.predict_by_feat(
+ mask_preds=merged_masks,
+ results_list=results_list,
+ batch_img_metas=batch_img_metas,
+ rcnn_test_cfg=self.test_cfg,
+ rescale=rescale,
+ activate_map=True)
+ return results_list
diff --git a/projects/Detic/detic/text_encoder.py b/projects/Detic/detic/text_encoder.py
new file mode 100644
index 00000000000..f0024efaf30
--- /dev/null
+++ b/projects/Detic/detic/text_encoder.py
@@ -0,0 +1,50 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Union
+
+import torch
+import torch.nn as nn
+
+
+class CLIPTextEncoder(nn.Module):
+
+ def __init__(self, model_name='ViT-B/32'):
+ super().__init__()
+ import clip
+ from clip.simple_tokenizer import SimpleTokenizer
+ self.tokenizer = SimpleTokenizer()
+ pretrained_model, _ = clip.load(model_name, device='cpu')
+ self.clip = pretrained_model
+
+ @property
+ def device(self):
+ return self.clip.device
+
+ @property
+ def dtype(self):
+ return self.clip.dtype
+
+ def tokenize(self,
+ texts: Union[str, List[str]],
+ context_length: int = 77) -> torch.LongTensor:
+ if isinstance(texts, str):
+ texts = [texts]
+
+ sot_token = self.tokenizer.encoder['<|startoftext|>']
+ eot_token = self.tokenizer.encoder['<|endoftext|>']
+ all_tokens = [[sot_token] + self.tokenizer.encode(text) + [eot_token]
+ for text in texts]
+ result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
+
+ for i, tokens in enumerate(all_tokens):
+ if len(tokens) > context_length:
+ st = torch.randint(len(tokens) - context_length + 1,
+ (1, ))[0].item()
+ tokens = tokens[st:st + context_length]
+ result[i, :len(tokens)] = torch.tensor(tokens)
+
+ return result
+
+ def forward(self, text):
+ text = self.tokenize(text)
+ text_features = self.clip.encode_text(text)
+ return text_features
diff --git a/projects/Detic/detic/utils.py b/projects/Detic/detic/utils.py
new file mode 100644
index 00000000000..56d4fd429d7
--- /dev/null
+++ b/projects/Detic/detic/utils.py
@@ -0,0 +1,78 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+import torch.nn.functional as F
+from mmengine.logging import print_log
+
+from .text_encoder import CLIPTextEncoder
+
+# download from
+# https://github.com/facebookresearch/Detic/tree/main/datasets/metadata
+DATASET_EMBEDDINGS = {
+ 'lvis': 'datasets/metadata/lvis_v1_clip_a+cname.npy',
+ 'objects365': 'datasets/metadata/o365_clip_a+cnamefix.npy',
+ 'openimages': 'datasets/metadata/oid_clip_a+cname.npy',
+ 'coco': 'datasets/metadata/coco_clip_a+cname.npy',
+}
+
+
+def get_text_embeddings(dataset=None,
+ custom_vocabulary=None,
+ prompt_prefix='a '):
+ assert (dataset is None) ^ (custom_vocabulary is None), \
+        'Exactly one of `dataset` or `custom_vocabulary` should be specified.'
+ if dataset:
+ if dataset in DATASET_EMBEDDINGS:
+ return DATASET_EMBEDDINGS[dataset]
+ else:
+ custom_vocabulary = get_class_names(dataset)
+
+ text_encoder = CLIPTextEncoder()
+ text_encoder.eval()
+ texts = [prompt_prefix + x for x in custom_vocabulary]
+ print_log(
+ f'Computing text embeddings for {len(custom_vocabulary)} classes.')
+ embeddings = text_encoder(texts).detach().permute(1, 0).contiguous().cpu()
+ return embeddings
+
+
+def get_class_names(dataset):
+ if dataset == 'coco':
+ from mmdet.datasets import CocoDataset
+ class_names = CocoDataset.METAINFO['classes']
+ elif dataset == 'cityscapes':
+ from mmdet.datasets import CityscapesDataset
+ class_names = CityscapesDataset.METAINFO['classes']
+ elif dataset == 'voc':
+ from mmdet.datasets import VOCDataset
+ class_names = VOCDataset.METAINFO['classes']
+ elif dataset == 'openimages':
+ from mmdet.datasets import OpenImagesDataset
+ class_names = OpenImagesDataset.METAINFO['classes']
+ elif dataset == 'lvis':
+ from mmdet.datasets import LVISV1Dataset
+ class_names = LVISV1Dataset.METAINFO['classes']
+ else:
+        raise ValueError(f'Unsupported dataset name: {dataset}')
+ return class_names
+
+
+def reset_cls_layer_weight(model, weight):
+    if isinstance(weight, str):
+ print_log(f'Resetting cls_layer_weight from file: {weight}')
+ zs_weight = torch.tensor(
+ np.load(weight),
+ dtype=torch.float32).permute(1, 0).contiguous() # D x C
+ else:
+ zs_weight = weight
+ zs_weight = torch.cat(
+ [zs_weight, zs_weight.new_zeros(
+ (zs_weight.shape[0], 1))], dim=1) # D x (C + 1)
+ zs_weight = F.normalize(zs_weight, p=2, dim=0)
+ zs_weight = zs_weight.to('cuda')
+ num_classes = zs_weight.shape[-1]
+
+ for bbox_head in model.roi_head.bbox_head:
+ bbox_head.num_classes = num_classes
+ del bbox_head.fc_cls.zs_weight
+ bbox_head.fc_cls.zs_weight = zs_weight
diff --git a/projects/Detic/detic/zero_shot_classifier.py b/projects/Detic/detic/zero_shot_classifier.py
new file mode 100644
index 00000000000..35c9e49285c
--- /dev/null
+++ b/projects/Detic/detic/zero_shot_classifier.py
@@ -0,0 +1,73 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from mmdet.registry import MODELS
+
+
+@MODELS.register_module(force=True) # avoid bug
+class ZeroShotClassifier(nn.Module):
+
+ def __init__(
+ self,
+ in_features: int,
+ out_features: int, # num_classes
+ zs_weight_path: str,
+ zs_weight_dim: int = 512,
+ use_bias: float = 0.0,
+ norm_weight: bool = True,
+ norm_temperature: float = 50.0,
+ ):
+ super().__init__()
+ num_classes = out_features
+ self.norm_weight = norm_weight
+ self.norm_temperature = norm_temperature
+
+ self.use_bias = use_bias < 0
+ if self.use_bias:
+ self.cls_bias = nn.Parameter(torch.ones(1) * use_bias)
+
+ self.linear = nn.Linear(in_features, zs_weight_dim)
+
+ if zs_weight_path == 'rand':
+ zs_weight = torch.randn((zs_weight_dim, num_classes))
+ nn.init.normal_(zs_weight, std=0.01)
+ else:
+ zs_weight = torch.tensor(
+ np.load(zs_weight_path),
+ dtype=torch.float32).permute(1, 0).contiguous() # D x C
+ zs_weight = torch.cat(
+ [zs_weight, zs_weight.new_zeros(
+ (zs_weight_dim, 1))], dim=1) # D x (C + 1)
+
+ if self.norm_weight:
+ zs_weight = F.normalize(zs_weight, p=2, dim=0)
+
+ if zs_weight_path == 'rand':
+ self.zs_weight = nn.Parameter(zs_weight)
+ else:
+ self.register_buffer('zs_weight', zs_weight)
+
+ assert self.zs_weight.shape[1] == num_classes + 1, self.zs_weight.shape
+
+ def forward(self, x, classifier=None):
+        '''
+        Inputs:
+            x: region features, shape B x D'
+            classifier: optional external classifier weights of shape
+                C' x D; if None, ``self.zs_weight`` is used.
+        '''
+ x = self.linear(x)
+ if classifier is not None:
+ zs_weight = classifier.permute(1, 0).contiguous() # D x C'
+ zs_weight = F.normalize(zs_weight, p=2, dim=0) \
+ if self.norm_weight else zs_weight
+ else:
+ zs_weight = self.zs_weight
+ if self.norm_weight:
+ x = self.norm_temperature * F.normalize(x, p=2, dim=1)
+ x = torch.mm(x, zs_weight)
+ if self.use_bias:
+ x = x + self.cls_bias
+ return x