From dcf2c7dc57ea98be2f586a1d5f520e401adbbe94 Mon Sep 17 00:00:00 2001 From: RangiLyu Date: Mon, 16 Jan 2023 10:43:44 +0800 Subject: [PATCH 01/12] [Feature] Support Detic inference. --- .../models/roi_heads/bbox_heads/bbox_head.py | 5 +- .../roi_heads/bbox_heads/convfc_bbox_head.py | 5 +- .../Detic/configs/detic_centernet2_r50_fpn.py | 249 ++++++++++++++++++ projects/Detic/detic/centernet_rpn_head.py | 151 +++++++++++ 4 files changed, 406 insertions(+), 4 deletions(-) create mode 100644 projects/Detic/configs/detic_centernet2_r50_fpn.py create mode 100644 projects/Detic/detic/centernet_rpn_head.py diff --git a/mmdet/models/roi_heads/bbox_heads/bbox_head.py b/mmdet/models/roi_heads/bbox_heads/bbox_head.py index ac98f3c3e03..3b2e8aae083 100644 --- a/mmdet/models/roi_heads/bbox_heads/bbox_head.py +++ b/mmdet/models/roi_heads/bbox_heads/bbox_head.py @@ -87,8 +87,9 @@ def __init__(self, out_dim_reg = box_dim if reg_class_agnostic else \ box_dim * num_classes reg_predictor_cfg_ = self.reg_predictor_cfg.copy() - reg_predictor_cfg_.update( - in_features=in_channels, out_features=out_dim_reg) + if isinstance(reg_predictor_cfg_, (dict, ConfigDict)): + reg_predictor_cfg_.update( + in_features=in_channels, out_features=out_dim_reg) self.fc_reg = MODELS.build(reg_predictor_cfg_) self.debug_imgs = None if init_cfg is None: diff --git a/mmdet/models/roi_heads/bbox_heads/convfc_bbox_head.py b/mmdet/models/roi_heads/bbox_heads/convfc_bbox_head.py index 28d76c70c45..cb6aadd86d3 100644 --- a/mmdet/models/roi_heads/bbox_heads/convfc_bbox_head.py +++ b/mmdet/models/roi_heads/bbox_heads/convfc_bbox_head.py @@ -95,8 +95,9 @@ def __init__(self, out_dim_reg = box_dim if self.reg_class_agnostic else \ box_dim * self.num_classes reg_predictor_cfg_ = self.reg_predictor_cfg.copy() - reg_predictor_cfg_.update( - in_features=self.reg_last_dim, out_features=out_dim_reg) + if isinstance(reg_predictor_cfg_, (dict, ConfigDict)): + reg_predictor_cfg_.update( + in_features=self.reg_last_dim, out_features=out_dim_reg) self.fc_reg = MODELS.build(reg_predictor_cfg_) if init_cfg is None: diff --git a/projects/Detic/configs/detic_centernet2_r50_fpn.py b/projects/Detic/configs/detic_centernet2_r50_fpn.py new file mode 100644 index 00000000000..4d9e75e3c63 --- /dev/null +++ b/projects/Detic/configs/detic_centernet2_r50_fpn.py @@ -0,0 +1,249 @@ +_base_ = 'mmdet::common/lsj-200e_coco-detection.py' + +image_size = (1024, 1024) +batch_augments = [dict(type='BatchFixedSizePad', size=image_size)] + +model = dict( + type='CascadeRCNN', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32, + batch_augments=batch_augments), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5, + init_cfg=dict(type='Caffe2Xavier', layer='Conv2d'), + relu_before_extra_convs=True), + rpn_head=dict( + type='CenterNetUpdateHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + strides=[8, 16, 32, 64, 128], + norm_cfg=None, + loss_cls=dict( + type='GaussianFocalLoss', + pos_weight=0.25, + neg_weight=0.75, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', 
loss_weight=2.0), + ), + roi_head=dict( + type='CascadeRoIHead', + num_stages=3, + stage_loss_weights=[1, 0.5, 0.25], + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + reg_predictor_cfg=[dict(type='Linear', in_features=256, out_features=256), + dict(type='ReLU', inplace=True), + dict(type='Linear', in_features=256, out_features=4)], + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + reg_predictor_cfg=[dict(type='Linear', in_features=256, out_features=256), + dict(type='ReLU', inplace=True), + dict(type='Linear', in_features=256, out_features=4)], + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + reg_predictor_cfg=[dict(type='Linear', in_features=256, out_features=256), + dict(type='ReLU', inplace=True), + dict(type='Linear', in_features=256, out_features=4)], + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) + ], + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type='FCNMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=80, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=[ + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + 
add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.7, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False) + ]), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100, + mask_thr_binary=0.5))) + + +train_dataloader = dict(batch_size=8, num_workers=4) +# Enable automatic-mixed-precision training with AmpOptimWrapper. +optim_wrapper = dict( + type='AmpOptimWrapper', + optimizer=dict( + type='SGD', lr=0.01 * 4, momentum=0.9, weight_decay=0.00004), + paramwise_cfg=dict(norm_decay_mult=0.)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.00025, + by_epoch=False, + begin=0, + end=4000), + dict( + type='MultiStepLR', + begin=0, + end=25, + by_epoch=True, + milestones=[22, 24], + gamma=0.1) +] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/projects/Detic/detic/centernet_rpn_head.py b/projects/Detic/detic/centernet_rpn_head.py new file mode 100644 index 00000000000..724df9f1bad --- /dev/null +++ b/projects/Detic/detic/centernet_rpn_head.py @@ -0,0 +1,151 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.models.dense_heads import CenterNetUpdateHead +from mmdet.registry import MODELS +from typing import Dict, List, Optional, Sequence, Tuple + +import torch +import torch.nn as nn +from mmcv.cnn import Scale +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures.bbox import bbox2distance +from mmdet.utils import (ConfigType, InstanceList, OptConfigType, + OptInstanceList, reduce_mean) +from mmdet.models.utils import multi_apply + +INF = 1000000000 +RangeType = Sequence[Tuple[int, int]] + + +@MODELS.register_module() +class CenterNetRPNHead(CenterNetUpdateHead): + """CenterNetUpdateHead is an improved version of CenterNet in CenterNet2. + Paper link ``_. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channel in the input feature map. + regress_ranges (Sequence[Tuple[int, int]]): Regress range of multiple + level points. + hm_min_radius (int): Heatmap target minimum radius of cls branch. + Defaults to 4. + hm_min_overlap (float): Heatmap target minimum overlap of cls branch. + Defaults to 0.8. + more_pos_thresh (float): The filtering threshold when the cls branch + adds more positive samples. Defaults to 0.2. + more_pos_topk (int): The maximum number of additional positive samples + added to each gt. Defaults to 9. + soft_weight_on_reg (bool): Whether to use the soft target of the + cls branch as the soft weight of the bbox branch. + Defaults to False. + loss_cls (:obj:`ConfigDict` or dict): Config of cls loss. Defaults to + dict(type='GaussianFocalLoss', loss_weight=1.0) + loss_bbox (:obj:`ConfigDict` or dict): Config of bbox loss. Defaults to + dict(type='GIoULoss', loss_weight=2.0). + norm_cfg (:obj:`ConfigDict` or dict, optional): dictionary to construct + and config norm layer. 
Defaults to + ``norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)``. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config. + Unused in CenterNet. Reserved for compatibility with + SingleStageDetector. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config + of CenterNet. + """ + + def __init__(self, + num_classes: int, + in_channels: int, + regress_ranges: RangeType = ((0, 80), (64, 160), (128, 320), + (256, 640), (512, INF)), + hm_min_radius: int = 4, + hm_min_overlap: float = 0.8, + more_pos_thresh: float = 0.2, + more_pos_topk: int = 9, + soft_weight_on_reg: bool = False, + loss_cls: ConfigType = dict( + type='GaussianFocalLoss', + pos_weight=0.25, + neg_weight=0.75, + loss_weight=1.0), + loss_bbox: ConfigType = dict( + type='GIoULoss', loss_weight=2.0), + norm_cfg: OptConfigType = dict( + type='GN', num_groups=32, requires_grad=True), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + **kwargs) -> None: + super().__init__( + num_classes=num_classes, + in_channels=in_channels, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + norm_cfg=norm_cfg, + train_cfg=train_cfg, + test_cfg=test_cfg, + **kwargs) + self.soft_weight_on_reg = soft_weight_on_reg + self.hm_min_radius = hm_min_radius + self.more_pos_thresh = more_pos_thresh + self.more_pos_topk = more_pos_topk + self.delta = (1 - hm_min_overlap) / (1 + hm_min_overlap) + self.sigmoid_clamp = 0.0001 + + # GaussianFocalLoss must be sigmoid mode + self.use_sigmoid_cls = True + self.cls_out_channels = num_classes + + self.regress_ranges = regress_ranges + self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides]) + + def _init_predictor(self) -> None: + """Initialize predictor layers of the head.""" + self.conv_cls = nn.Conv2d( + self.feat_channels, self.num_classes, 3, padding=1) + self.conv_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List[Tensor], List[Tensor]]: + """Forward features from the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: A tuple of each level outputs. + + - cls_scores (list[Tensor]): Box scores for each scale level, \ + each is a 4D-tensor, the channel number is num_classes. + - bbox_preds (list[Tensor]): Box energies / deltas for each \ + scale level, each is a 4D-tensor, the channel number is 4. + """ + return multi_apply(self.forward_single, x, self.scales, self.strides) + + def forward_single(self, x: Tensor, scale: Scale, + stride: int) -> Tuple[Tensor, Tensor]: + """Forward features of a single scale level. + + Args: + x (Tensor): FPN feature maps of the specified stride. + scale (:obj:`mmcv.cnn.Scale`): Learnable scale module to resize + the bbox prediction. + stride (int): The corresponding stride for feature maps. + + Returns: + tuple: scores for each class, bbox predictions of + input feature maps. + """ + feat = self.reg_convs(x) + cls_score = self.conv_cls() + # scale the bbox_pred of different level + # float to avoid overflow when enabling FP16 + bbox_pred = scale(bbox_pred).float() + # bbox_pred needed for gradient computation has been modified + # by F.relu(bbox_pred) when run with PyTorch 1.10. 
So replace + # F.relu(bbox_pred) with bbox_pred.clamp(min=0) + bbox_pred = bbox_pred.clamp(min=0) + if not self.training: + bbox_pred *= stride + return cls_score, bbox_pred \ No newline at end of file From f7bd1bb8d903803b761962f4c6d06cec01a9b7e9 Mon Sep 17 00:00:00 2001 From: RangiLyu Date: Tue, 17 Jan 2023 17:11:43 +0800 Subject: [PATCH 02/12] add model --- .../Detic/configs/detic_centernet2_r50_fpn.py | 39 +- projects/Detic/detic/__init__.py | 4 + projects/Detic/detic/centernet_rpn_head.py | 15 +- projects/Detic/detic/detic_roi_head.py | 373 ++++++++++++++++++ projects/Detic/detic/zero_shot_classifier.py | 73 ++++ 5 files changed, 486 insertions(+), 18 deletions(-) create mode 100644 projects/Detic/detic/__init__.py create mode 100644 projects/Detic/detic/detic_roi_head.py create mode 100644 projects/Detic/detic/zero_shot_classifier.py diff --git a/projects/Detic/configs/detic_centernet2_r50_fpn.py b/projects/Detic/configs/detic_centernet2_r50_fpn.py index 4d9e75e3c63..4582afbfb99 100644 --- a/projects/Detic/configs/detic_centernet2_r50_fpn.py +++ b/projects/Detic/configs/detic_centernet2_r50_fpn.py @@ -1,8 +1,24 @@ _base_ = 'mmdet::common/lsj-200e_coco-detection.py' +custom_imports = dict( + imports=['projects.Detic.detic'], allow_failed_imports=False) + image_size = (1024, 1024) batch_augments = [dict(type='BatchFixedSizePad', size=image_size)] +cls_layer = dict( + type='ZeroShotClassifier', + zs_weight_path='rand', + zs_weight_dim=512, + use_bias=0.0, + norm_weight=True, + norm_temperature=50.0) +reg_layer = [ + dict(type='Linear', in_features=256, out_features=256), + dict(type='ReLU', inplace=True), + dict(type='Linear', in_features=256, out_features=4) +] + model = dict( type='CascadeRCNN', data_preprocessor=dict( @@ -32,13 +48,14 @@ init_cfg=dict(type='Caffe2Xavier', layer='Conv2d'), relu_before_extra_convs=True), rpn_head=dict( - type='CenterNetUpdateHead', + type='CenterNetRPNHead', num_classes=80, in_channels=256, stacked_convs=4, feat_channels=256, strides=[8, 16, 32, 64, 128], - norm_cfg=None, + conv_bias=True, + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), loss_cls=dict( type='GaussianFocalLoss', pos_weight=0.25, @@ -47,7 +64,7 @@ loss_bbox=dict(type='GIoULoss', loss_weight=2.0), ), roi_head=dict( - type='CascadeRoIHead', + type='DeticRoIHead', num_stages=3, stage_loss_weights=[1, 0.5, 0.25], bbox_roi_extractor=dict( @@ -62,9 +79,8 @@ fc_out_channels=1024, roi_feat_size=7, num_classes=80, - reg_predictor_cfg=[dict(type='Linear', in_features=256, out_features=256), - dict(type='ReLU', inplace=True), - dict(type='Linear', in_features=256, out_features=4)], + cls_predictor_cfg=cls_layer, + reg_predictor_cfg=reg_layer, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0., 0., 0., 0.], @@ -82,9 +98,8 @@ fc_out_channels=1024, roi_feat_size=7, num_classes=80, - reg_predictor_cfg=[dict(type='Linear', in_features=256, out_features=256), - dict(type='ReLU', inplace=True), - dict(type='Linear', in_features=256, out_features=4)], + cls_predictor_cfg=cls_layer, + reg_predictor_cfg=reg_layer, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0., 0., 0., 0.], @@ -102,9 +117,8 @@ fc_out_channels=1024, roi_feat_size=7, num_classes=80, - reg_predictor_cfg=[dict(type='Linear', in_features=256, out_features=256), - dict(type='ReLU', inplace=True), - dict(type='Linear', in_features=256, out_features=4)], + cls_predictor_cfg=cls_layer, + reg_predictor_cfg=reg_layer, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0., 0., 0., 0.], @@ -218,7 +232,6 @@ 
max_per_img=100, mask_thr_binary=0.5))) - train_dataloader = dict(batch_size=8, num_workers=4) # Enable automatic-mixed-precision training with AmpOptimWrapper. optim_wrapper = dict( diff --git a/projects/Detic/detic/__init__.py b/projects/Detic/detic/__init__.py new file mode 100644 index 00000000000..51d35020e82 --- /dev/null +++ b/projects/Detic/detic/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .centernet_rpn_head import CenterNetRPNHead +from .detic_roi_head import DeticRoIHead +from .zero_shot_classifier import ZeroShotClassifier diff --git a/projects/Detic/detic/centernet_rpn_head.py b/projects/Detic/detic/centernet_rpn_head.py index 724df9f1bad..3db9c7a6006 100644 --- a/projects/Detic/detic/centernet_rpn_head.py +++ b/projects/Detic/detic/centernet_rpn_head.py @@ -1,6 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from mmdet.models.dense_heads import CenterNetUpdateHead -from mmdet.registry import MODELS from typing import Dict, List, Optional, Sequence, Tuple import torch @@ -9,11 +7,12 @@ from mmengine.structures import InstanceData from torch import Tensor +from mmdet.models.dense_heads import CenterNetUpdateHead +from mmdet.models.utils import multi_apply from mmdet.registry import MODELS from mmdet.structures.bbox import bbox2distance from mmdet.utils import (ConfigType, InstanceList, OptConfigType, OptInstanceList, reduce_mean) -from mmdet.models.utils import multi_apply INF = 1000000000 RangeType = Sequence[Tuple[int, int]] @@ -100,6 +99,11 @@ def __init__(self, self.regress_ranges = regress_ranges self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides]) + def _init_layers(self) -> None: + """Initialize layers of the head.""" + self._init_reg_convs() + self._init_predictor() + def _init_predictor(self) -> None: """Initialize predictor layers of the head.""" self.conv_cls = nn.Conv2d( @@ -138,7 +142,8 @@ def forward_single(self, x: Tensor, scale: Scale, input feature maps. """ feat = self.reg_convs(x) - cls_score = self.conv_cls() + cls_score = self.conv_cls(feat) + bbox_pred = self.reg_convs(feat) # scale the bbox_pred of different level # float to avoid overflow when enabling FP16 bbox_pred = scale(bbox_pred).float() @@ -148,4 +153,4 @@ def forward_single(self, x: Tensor, scale: Scale, bbox_pred = bbox_pred.clamp(min=0) if not self.training: bbox_pred *= stride - return cls_score, bbox_pred \ No newline at end of file + return cls_score, bbox_pred diff --git a/projects/Detic/detic/detic_roi_head.py b/projects/Detic/detic/detic_roi_head.py new file mode 100644 index 00000000000..b31253e8773 --- /dev/null +++ b/projects/Detic/detic/detic_roi_head.py @@ -0,0 +1,373 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
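+# DeticRoIHead (below) adapts CascadeRoIHead for Detic inference: a single
+# shared mask head is built instead of a per-stage list and is run only on
+# the final detected boxes, while `predict_bbox` applies the
+# CenterNet2-style test-time behaviour of fusing RPN proposal scores with
+# the stage-averaged classification scores (geometric mean) and keeping a
+# single class per proposal before NMS.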
+from typing import List, Sequence, Tuple, Union + +import torch +import torch.nn as nn +from mmengine.model import ModuleList +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.models.roi_heads import CascadeRoIHead +from mmdet.models.task_modules.samplers import SamplingResult +from mmdet.models.test_time_augs import merge_aug_masks +from mmdet.models.utils.misc import empty_instances, unpack_gt_instances +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures import SampleList +from mmdet.structures.bbox import bbox2roi, get_box_tensor +from mmdet.utils import (ConfigType, InstanceList, MultiConfig, OptConfigType, + OptMultiConfig) + + +@MODELS.register_module() +class DeticRoIHead(CascadeRoIHead): + + def init_mask_head(self, mask_roi_extractor: MultiConfig, + mask_head: MultiConfig) -> None: + """Initialize mask head and mask roi extractor. + + Args: + mask_head (dict): Config of mask in mask head. + mask_roi_extractor (:obj:`ConfigDict`, dict or list): + Config of mask roi extractor. + """ + self.mask_head = MODELS.build(mask_head) + + if mask_roi_extractor is not None: + self.share_roi_extractor = False + self.mask_roi_extractor = MODELS.build(mask_roi_extractor) + else: + self.share_roi_extractor = True + self.mask_roi_extractor = self.bbox_roi_extractor + + def _bbox_forward(self, stage: int, x: Tuple[Tensor], + rois: Tensor) -> dict: + """Box head forward function used in both training and testing. + + Args: + stage (int): The current stage in Cascade RoI Head. + x (tuple[Tensor]): List of multi-level img features. + rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + + Returns: + dict[str, Tensor]: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `bbox_feats` (Tensor): Extract bbox RoI features. + """ + bbox_roi_extractor = self.bbox_roi_extractor[stage] + bbox_head = self.bbox_head[stage] + bbox_feats = bbox_roi_extractor(x[:bbox_roi_extractor.num_inputs], + rois) + # do not support caffe_c4 model anymore + cls_score, bbox_pred = bbox_head(bbox_feats) + + bbox_results = dict( + cls_score=cls_score, bbox_pred=bbox_pred, bbox_feats=bbox_feats) + return bbox_results + + def predict_bbox(self, + x: Tuple[Tensor], + batch_img_metas: List[dict], + rpn_results_list: InstanceList, + rcnn_test_cfg: ConfigType, + rescale: bool = False, + **kwargs) -> InstanceList: + """Perform forward propagation of the bbox head and predict detection + results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Feature maps of all scale level. + batch_img_metas (list[dict]): List of image information. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of R-CNN. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). 
+ """ + proposals = [res.bboxes for res in rpn_results_list] + proposal_scores = [res.scores for res in rpn_results_list] + num_proposals_per_img = tuple(len(p) for p in proposals) + rois = bbox2roi(proposals) + + if rois.shape[0] == 0: + return empty_instances( + batch_img_metas, + rois.device, + task_type='bbox', + box_type=self.bbox_head[-1].predict_box_type, + num_classes=self.bbox_head[-1].num_classes, + score_per_cls=rcnn_test_cfg is None) + + rois, cls_scores, bbox_preds = self._refine_roi( + x=x, + rois=rois, + batch_img_metas=batch_img_metas, + num_proposals_per_img=num_proposals_per_img, + **kwargs) + + # TODO + self.mult_proposal_score = True + self.one_class_per_proposal = True + + # centernet2 + if self.mult_proposal_score: # True + cls_scores = [(s * ps[:, None]) ** 0.5 \ + for s, ps in zip(cls_scores, proposal_scores)] + if self.one_class_per_proposal: # True + cls_scores = [ + s * (s == s[:, :-1].max(dim=1)[0][:, None]).float() + for s in cls_scores + ] + + # fast_rcnn_inference + results_list = self.bbox_head[-1].predict_by_feat( + rois=rois, + cls_scores=cls_scores, + bbox_preds=bbox_preds, + batch_img_metas=batch_img_metas, + rescale=rescale, + rcnn_test_cfg=rcnn_test_cfg) + return results_list + + def bbox_loss(self, stage: int, x: Tuple[Tensor], + sampling_results: List[SamplingResult]) -> dict: + """Run forward function and calculate loss for box head in training. + + Args: + stage (int): The current stage in Cascade RoI Head. + x (tuple[Tensor]): List of multi-level img features. + sampling_results (list["obj:`SamplingResult`]): Sampling results. + + Returns: + dict: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `bbox_feats` (Tensor): Extract bbox RoI features. + - `loss_bbox` (dict): A dictionary of bbox loss components. + - `rois` (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + - `bbox_targets` (tuple): Ground truth for proposals in a + single image. Containing the following list of Tensors: + (labels, label_weights, bbox_targets, bbox_weights) + """ + bbox_head = self.bbox_head[stage] + rois = bbox2roi([res.priors for res in sampling_results]) + bbox_results = self._bbox_forward(stage, x, rois) + bbox_results.update(rois=rois) + + bbox_loss_and_target = bbox_head.loss_and_target( + cls_score=bbox_results['cls_score'], + bbox_pred=bbox_results['bbox_pred'], + rois=rois, + sampling_results=sampling_results, + rcnn_train_cfg=self.train_cfg[stage]) + bbox_results.update(bbox_loss_and_target) + + return bbox_results + + def _mask_forward(self, x: Tuple[Tensor], rois: Tensor) -> dict: + """Mask head forward function used in both training and testing. + + Args: + stage (int): The current stage in Cascade RoI Head. + x (tuple[Tensor]): Tuple of multi-level img features. + rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + + Returns: + dict: Usually returns a dictionary with keys: + + - `mask_preds` (Tensor): Mask prediction. + """ + mask_feats = self.mask_roi_extractor( + x[:self.mask_roi_extractor.num_inputs], rois) + # do not support caffe_c4 model anymore + mask_preds = self.mask_head(mask_feats) + + mask_results = dict(mask_preds=mask_preds) + return mask_results + + def mask_loss(self, x, sampling_results: List[SamplingResult], + batch_gt_instances: InstanceList) -> dict: + """Run forward function and calculate loss for mask head in training. 
+ + Args: + x (tuple[Tensor]): Tuple of multi-level img features. + sampling_results (list["obj:`SamplingResult`]): Sampling results. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes``, ``labels``, and + ``masks`` attributes. + + Returns: + dict: Usually returns a dictionary with keys: + + - `mask_preds` (Tensor): Mask prediction. + - `loss_mask` (dict): A dictionary of mask loss components. + """ + pos_rois = bbox2roi([res.pos_priors for res in sampling_results]) + mask_results = self._mask_forward(x, pos_rois) + + mask_loss_and_target = self.mask_head.loss_and_target( + mask_preds=mask_results['mask_preds'], + sampling_results=sampling_results, + batch_gt_instances=batch_gt_instances, + rcnn_train_cfg=self.train_cfg[-1]) + mask_results.update(mask_loss_and_target) + + return mask_results + + def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList, + batch_data_samples: SampleList) -> dict: + """Perform forward propagation and loss calculation of the detection + roi on the features of the upstream network. + + Args: + x (tuple[Tensor]): List of multi-level img features. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict[str, Tensor]: A dictionary of loss components + """ + # TODO: May add a new function in baseroihead + assert len(rpn_results_list) == len(batch_data_samples) + outputs = unpack_gt_instances(batch_data_samples) + batch_gt_instances, batch_gt_instances_ignore, batch_img_metas \ + = outputs + + num_imgs = len(batch_data_samples) + losses = dict() + results_list = rpn_results_list + for stage in range(self.num_stages): + self.current_stage = stage + + stage_loss_weight = self.stage_loss_weights[stage] + + # assign gts and sample proposals + sampling_results = [] + if self.with_bbox or self.with_mask: + bbox_assigner = self.bbox_assigner[stage] + bbox_sampler = self.bbox_sampler[stage] + + for i in range(num_imgs): + results = results_list[i] + # rename rpn_results.bboxes to rpn_results.priors + results.priors = results.pop('bboxes') + + assign_result = bbox_assigner.assign( + results, batch_gt_instances[i], + batch_gt_instances_ignore[i]) + + sampling_result = bbox_sampler.sample( + assign_result, + results, + batch_gt_instances[i], + feats=[lvl_feat[i][None] for lvl_feat in x]) + sampling_results.append(sampling_result) + + # bbox head forward and loss + bbox_results = self.bbox_loss(stage, x, sampling_results) + + for name, value in bbox_results['loss_bbox'].items(): + losses[f's{stage}.{name}'] = ( + value * stage_loss_weight if 'loss' in name else value) + + # mask head forward and loss + # TODO: only last stage + if self.with_mask: + mask_results = self.mask_loss(stage, x, sampling_results, + batch_gt_instances) + for name, value in mask_results['loss_mask'].items(): + losses[f's{stage}.{name}'] = ( + value * stage_loss_weight if 'loss' in name else value) + + # refine bboxes + if stage < self.num_stages - 1: + bbox_head = self.bbox_head[stage] + with torch.no_grad(): + results_list = bbox_head.refine_bboxes( + sampling_results, bbox_results, batch_img_metas) + # Empty proposal + if results_list is None: + break + return losses + + def predict_mask(self, + x: Tuple[Tensor], + batch_img_metas: List[dict], + results_list: List[InstanceData], + rescale: bool = False) -> List[InstanceData]: + """Perform 
forward propagation of the mask head and predict detection + results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Feature maps of all scale level. + batch_img_metas (list[dict]): List of image information. + results_list (list[:obj:`InstanceData`]): Detection results of + each image. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, H, W). + """ + bboxes = [res.bboxes for res in results_list] + mask_rois = bbox2roi(bboxes) + if mask_rois.shape[0] == 0: + results_list = empty_instances( + batch_img_metas, + mask_rois.device, + task_type='mask', + instance_results=results_list, + mask_thr_binary=self.test_cfg.mask_thr_binary) + return results_list + + num_mask_rois_per_img = [len(res) for res in results_list] + aug_masks = [] + mask_results = self._mask_forward(x, mask_rois) + mask_preds = mask_results['mask_preds'] + # split batch mask prediction back to each image + mask_preds = mask_preds.split(num_mask_rois_per_img, 0) + aug_masks.append([m.sigmoid().detach() for m in mask_preds]) + + merged_masks = [] + for i in range(len(batch_img_metas)): + aug_mask = [mask[i] for mask in aug_masks] + merged_mask = merge_aug_masks(aug_mask, batch_img_metas[i]) + merged_masks.append(merged_mask) + results_list = self.mask_head.predict_by_feat( + mask_preds=merged_masks, + results_list=results_list, + batch_img_metas=batch_img_metas, + rcnn_test_cfg=self.test_cfg, + rescale=rescale, + activate_map=True) + return results_list diff --git a/projects/Detic/detic/zero_shot_classifier.py b/projects/Detic/detic/zero_shot_classifier.py new file mode 100644 index 00000000000..8fbb886b0f5 --- /dev/null +++ b/projects/Detic/detic/zero_shot_classifier.py @@ -0,0 +1,73 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
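+# ZeroShotClassifier replaces the usual fully-connected classification
+# branch of the bbox head: RoI features are linearly projected into an
+# embedding space and scored against one weight vector per class via a
+# temperature-scaled dot product (a cosine similarity when
+# norm_weight=True). The per-class weights come from `zs_weight_path`,
+# either a .npy file of fixed embeddings (e.g. CLIP text embeddings) kept
+# as a frozen buffer, or, for the literal string 'rand', a randomly
+# initialized learnable matrix; a zero column is appended for the
+# background class. A learnable bias (initialized to `use_bias`) is added
+# to the logits only when `use_bias` is negative, matching the upstream
+# Detic implementation, and the optional `classifier` argument of
+# `forward` lets callers supply custom class embeddings at inference time.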
+import numpy as np +import torch +from torch import nn +from torch.nn import functional as F + +from mmdet.registry import MODELS + + +@MODELS.register_module() +class ZeroShotClassifier(nn.Module): + + def __init__( + self, + in_features: int, + out_features: int, # num_classes + zs_weight_path: str, + zs_weight_dim: int = 512, + use_bias: float = 0.0, + norm_weight: bool = True, + norm_temperature: float = 50.0, + ): + super().__init__() + num_classes = out_features + self.norm_weight = norm_weight + self.norm_temperature = norm_temperature + + self.use_bias = use_bias < 0 + if self.use_bias: + self.cls_bias = nn.Parameter(torch.ones(1) * use_bias) + + self.linear = nn.Linear(in_features, zs_weight_dim) + + if zs_weight_path == 'rand': + zs_weight = torch.randn((zs_weight_dim, num_classes)) + nn.init.normal_(zs_weight, std=0.01) + else: + zs_weight = torch.tensor( + np.load(zs_weight_path), + dtype=torch.float32).permute(1, 0).contiguous() # D x C + zs_weight = torch.cat( + [zs_weight, zs_weight.new_zeros( + (zs_weight_dim, 1))], dim=1) # D x (C + 1) + + if self.norm_weight: + zs_weight = F.normalize(zs_weight, p=2, dim=0) + + if zs_weight_path == 'rand': + self.zs_weight = nn.Parameter(zs_weight) + else: + self.register_buffer('zs_weight', zs_weight) + + assert self.zs_weight.shape[1] == num_classes + 1, self.zs_weight.shape + + def forward(self, x, classifier=None): + ''' + Inputs: + x: B x D' + classifier_info: (C', C' x D) + ''' + x = self.linear(x) + if classifier is not None: + zs_weight = classifier.permute(1, 0).contiguous() # D x C' + zs_weight = F.normalize(zs_weight, p=2, dim=0) \ + if self.norm_weight else zs_weight + else: + zs_weight = self.zs_weight + if self.norm_weight: + x = self.norm_temperature * F.normalize(x, p=2, dim=1) + x = torch.mm(x, zs_weight) + if self.use_bias: + x = x + self.cls_bias + return x From d078372bcbd54e0ff7af9dd3a5ecb7e7097fd760 Mon Sep 17 00:00:00 2001 From: RangiLyu Date: Tue, 17 Jan 2023 20:05:30 +0800 Subject: [PATCH 03/12] [Projects] Support Detic inference. 
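Align the config with the released Detic R50 checkpoint: LVIS-sized
classification heads with sigmoid scoring, class-agnostic masks, a
single-class CenterNet RPN, and the 1024-d class-agnostic regression MLP
used by the original weights.

The classification branch itself is the ZeroShotClassifier introduced in
the previous commit; its score for each class reduces to a
temperature-scaled cosine similarity between the projected RoI feature and
a class embedding. A minimal sketch of that scoring math, assuming
norm_weight=True and no per-call classifier is passed in (all sizes and
tensor names below are illustrative only):

    import torch
    import torch.nn.functional as F

    # Illustrative sizes: 1024-d RoI features, 512-d embeddings,
    # temperature 50, and a (C + 1)-column weight matrix whose last
    # column is the background slot.
    num_classes, d_in, d_embed, tau = 1203, 1024, 512, 50.0
    feats = torch.randn(8, d_in)                       # B x D' RoI features
    zs_weight = torch.randn(d_embed, num_classes + 1)  # D x (C + 1)

    proj = torch.nn.Linear(d_in, d_embed)
    x = tau * F.normalize(proj(feats), p=2, dim=1)     # scaled unit vectors
    logits = x @ F.normalize(zs_weight, p=2, dim=0)    # B x (C + 1) cosines

Keeping both factors on the unit sphere bounds the logits, which helps a
single temperature behave consistently across the large LVIS vocabulary
and across swapped-in embedding sets such as CLIP text embeddings.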
--- .../Detic/configs/detic_centernet2_r50_fpn.py | 68 ++++++++++--------- projects/Detic/detic/centernet_rpn_head.py | 7 +- 2 files changed, 41 insertions(+), 34 deletions(-) diff --git a/projects/Detic/configs/detic_centernet2_r50_fpn.py b/projects/Detic/configs/detic_centernet2_r50_fpn.py index 4582afbfb99..87a450df1d4 100644 --- a/projects/Detic/configs/detic_centernet2_r50_fpn.py +++ b/projects/Detic/configs/detic_centernet2_r50_fpn.py @@ -14,13 +14,20 @@ norm_weight=True, norm_temperature=50.0) reg_layer = [ - dict(type='Linear', in_features=256, out_features=256), + dict(type='Linear', in_features=1024, out_features=1024), dict(type='ReLU', inplace=True), - dict(type='Linear', in_features=256, out_features=4) + dict(type='Linear', in_features=1024, out_features=4) ] +num_classes = 1203 - 1 + model = dict( type='CascadeRCNN', + init_cfg=dict( + type='Pretrained', + checkpoint= + '/home/rangilyu/projects/Detic/models/Detic_LbaseI_CLIP_R5021k_640b64_4x_ft4x_max-size_mmdet.pth' + ), data_preprocessor=dict( type='DetDataPreprocessor', mean=[123.675, 116.28, 103.53], @@ -32,7 +39,7 @@ type='ResNet', depth=50, num_stages=4, - out_indices=(0, 1, 2, 3), + out_indices=(1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=True), norm_eval=True, @@ -40,16 +47,16 @@ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), neck=dict( type='FPN', - in_channels=[256, 512, 1024, 2048], + in_channels=[512, 1024, 2048], out_channels=256, - start_level=1, + start_level=0, add_extra_convs='on_output', num_outs=5, init_cfg=dict(type='Caffe2Xavier', layer='Conv2d'), relu_before_extra_convs=True), rpn_head=dict( type='CenterNetRPNHead', - num_classes=80, + num_classes=1, in_channels=256, stacked_convs=4, feat_channels=256, @@ -71,14 +78,14 @@ type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), out_channels=256, - featmap_strides=[4, 8, 16, 32]), + featmap_strides=[8, 16, 32]), bbox_head=[ dict( type='Shared2FCBBoxHead', in_channels=256, fc_out_channels=1024, roi_feat_size=7, - num_classes=80, + num_classes=num_classes, cls_predictor_cfg=cls_layer, reg_predictor_cfg=reg_layer, bbox_coder=dict( @@ -87,8 +94,7 @@ target_stds=[0.1, 0.1, 0.2, 0.2]), reg_class_agnostic=True, loss_cls=dict( - type='CrossEntropyLoss', - use_sigmoid=False, + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), @@ -97,7 +103,7 @@ in_channels=256, fc_out_channels=1024, roi_feat_size=7, - num_classes=80, + num_classes=num_classes, cls_predictor_cfg=cls_layer, reg_predictor_cfg=reg_layer, bbox_coder=dict( @@ -106,8 +112,7 @@ target_stds=[0.05, 0.05, 0.1, 0.1]), reg_class_agnostic=True, loss_cls=dict( - type='CrossEntropyLoss', - use_sigmoid=False, + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), @@ -116,7 +121,7 @@ in_channels=256, fc_out_channels=1024, roi_feat_size=7, - num_classes=80, + num_classes=num_classes, cls_predictor_cfg=cls_layer, reg_predictor_cfg=reg_layer, bbox_coder=dict( @@ -125,8 +130,7 @@ target_stds=[0.033, 0.033, 0.067, 0.067]), reg_class_agnostic=True, loss_cls=dict( - type='CrossEntropyLoss', - use_sigmoid=False, + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) ], @@ -134,13 +138,14 @@ type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), out_channels=256, - featmap_strides=[4, 8, 
16, 32]), + featmap_strides=[8, 16, 32]), mask_head=dict( type='FCNMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, - num_classes=80, + class_agnostic=True, + num_classes=num_classes, loss_mask=dict( type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), # model training and testing settings @@ -171,9 +176,9 @@ dict( assigner=dict( type='MaxIoUAssigner', - pos_iou_thr=0.5, - neg_iou_thr=0.5, - min_pos_iou=0.5, + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, match_low_quality=False, ignore_iof_thr=-1), sampler=dict( @@ -188,9 +193,9 @@ dict( assigner=dict( type='MaxIoUAssigner', - pos_iou_thr=0.6, - neg_iou_thr=0.6, - min_pos_iou=0.6, + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.7, match_low_quality=False, ignore_iof_thr=-1), sampler=dict( @@ -205,9 +210,9 @@ dict( assigner=dict( type='MaxIoUAssigner', - pos_iou_thr=0.7, - neg_iou_thr=0.7, - min_pos_iou=0.7, + pos_iou_thr=0.8, + neg_iou_thr=0.8, + min_pos_iou=0.8, match_low_quality=False, ignore_iof_thr=-1), sampler=dict( @@ -222,13 +227,14 @@ ]), test_cfg=dict( rpn=dict( + score_thr=0.0001, nms_pre=1000, - max_per_img=1000, - nms=dict(type='nms', iou_threshold=0.7), + max_per_img=256, + nms=dict(type='nms', iou_threshold=0.9), min_bbox_size=0), rcnn=dict( - score_thr=0.05, - nms=dict(type='nms', iou_threshold=0.5), + score_thr=0.02, + nms=dict(type='nms', iou_threshold=0.6), max_per_img=100, mask_thr_binary=0.5))) diff --git a/projects/Detic/detic/centernet_rpn_head.py b/projects/Detic/detic/centernet_rpn_head.py index 3db9c7a6006..8a1e3a2e8a2 100644 --- a/projects/Detic/detic/centernet_rpn_head.py +++ b/projects/Detic/detic/centernet_rpn_head.py @@ -141,9 +141,10 @@ def forward_single(self, x: Tensor, scale: Scale, tuple: scores for each class, bbox predictions of input feature maps. 
""" - feat = self.reg_convs(x) - cls_score = self.conv_cls(feat) - bbox_pred = self.reg_convs(feat) + for m in self.reg_convs: + x = m(x) + cls_score = self.conv_cls(x) + bbox_pred = self.conv_reg(x) # scale the bbox_pred of different level # float to avoid overflow when enabling FP16 bbox_pred = scale(bbox_pred).float() From f2759d859095338b817350b5a6b9a7a5bb6cf217 Mon Sep 17 00:00:00 2001 From: RangiLyu Date: Wed, 18 Jan 2023 16:26:20 +0800 Subject: [PATCH 04/12] align RPN inference --- .../Detic/configs/detic_centernet2_r50_fpn.py | 26 ++++- projects/Detic/detic/centernet_rpn_head.py | 67 +++++++++++- projects/Detic/detic/detic_roi_head.py | 100 +----------------- 3 files changed, 89 insertions(+), 104 deletions(-) diff --git a/projects/Detic/configs/detic_centernet2_r50_fpn.py b/projects/Detic/configs/detic_centernet2_r50_fpn.py index 87a450df1d4..602ca0f3635 100644 --- a/projects/Detic/configs/detic_centernet2_r50_fpn.py +++ b/projects/Detic/configs/detic_centernet2_r50_fpn.py @@ -233,12 +233,32 @@ nms=dict(type='nms', iou_threshold=0.9), min_bbox_size=0), rcnn=dict( - score_thr=0.02, - nms=dict(type='nms', iou_threshold=0.6), - max_per_img=100, + score_thr=0.5, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=300, mask_thr_binary=0.5))) +backend = 'pillow' +test_pipeline = [ + dict( + type='LoadImageFromFile', + file_client_args=_base_.file_client_args, + imdecode_backend=backend), + dict(type='Resize', scale=(1333, 800), keep_ratio=True, backend=backend), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + poly2mask=False), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + train_dataloader = dict(batch_size=8, num_workers=4) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader # Enable automatic-mixed-precision training with AmpOptimWrapper. optim_wrapper = dict( type='AmpOptimWrapper', diff --git a/projects/Detic/detic/centernet_rpn_head.py b/projects/Detic/detic/centernet_rpn_head.py index 8a1e3a2e8a2..d4936cfc88c 100644 --- a/projects/Detic/detic/centernet_rpn_head.py +++ b/projects/Detic/detic/centernet_rpn_head.py @@ -4,13 +4,16 @@ import torch import torch.nn as nn from mmcv.cnn import Scale +from mmcv.ops import batched_nms +from mmengine import ConfigDict from mmengine.structures import InstanceData from torch import Tensor from mmdet.models.dense_heads import CenterNetUpdateHead from mmdet.models.utils import multi_apply from mmdet.registry import MODELS -from mmdet.structures.bbox import bbox2distance +from mmdet.structures.bbox import (bbox2distance, cat_boxes, get_box_tensor, + get_box_wh, scale_boxes) from mmdet.utils import (ConfigType, InstanceList, OptConfigType, OptInstanceList, reduce_mean) @@ -154,4 +157,64 @@ def forward_single(self, x: Tensor, scale: Scale, bbox_pred = bbox_pred.clamp(min=0) if not self.training: bbox_pred *= stride - return cls_score, bbox_pred + return cls_score, bbox_pred # score aligned, box larger + + def _bbox_post_process(self, + results: InstanceData, + cfg: ConfigDict, + rescale: bool = False, + with_nms: bool = True, + img_meta: Optional[dict] = None) -> InstanceData: + """bbox post-processing method. + + The boxes would be rescaled to the original image scale and do + the nms operation. Usually `with_nms` is False is used for aug test. + + Args: + results (:obj:`InstaceData`): Detection instance results, + each item has shape (num_bboxes, ). 
+ cfg (ConfigDict): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Default to False. + with_nms (bool): If True, do nms before return boxes. + Default to True. + img_meta (dict, optional): Image meta info. Defaults to None. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + # apply sqrt when `with_agn_hm=True` in Detic + results.scores = torch.sqrt(results.scores) # TODO: train + + if rescale: + assert img_meta.get('scale_factor') is not None + scale_factor = [1 / s for s in img_meta['scale_factor']] + results.bboxes = scale_boxes(results.bboxes, scale_factor) + + # filter small size bboxes + if cfg.get('min_bbox_size', -1) >= 0: + w, h = get_box_wh(results.bboxes) + valid_mask = (w > cfg.min_bbox_size) & (h > cfg.min_bbox_size) + if not valid_mask.all(): + results = results[valid_mask] + + if with_nms and results.bboxes.numel() > 0: + bboxes = get_box_tensor(results.bboxes) + det_bboxes, keep_idxs = batched_nms(bboxes, results.scores, + results.labels, cfg.nms) + results = results[keep_idxs] + # some nms would reweight the score, such as softnms + results.scores = det_bboxes[:, -1] + results = results[:cfg.max_per_img] + + return results diff --git a/projects/Detic/detic/detic_roi_head.py b/projects/Detic/detic/detic_roi_head.py index b31253e8773..73a390e3ed9 100644 --- a/projects/Detic/detic/detic_roi_head.py +++ b/projects/Detic/detic/detic_roi_head.py @@ -143,43 +143,6 @@ def predict_bbox(self, rcnn_test_cfg=rcnn_test_cfg) return results_list - def bbox_loss(self, stage: int, x: Tuple[Tensor], - sampling_results: List[SamplingResult]) -> dict: - """Run forward function and calculate loss for box head in training. - - Args: - stage (int): The current stage in Cascade RoI Head. - x (tuple[Tensor]): List of multi-level img features. - sampling_results (list["obj:`SamplingResult`]): Sampling results. - - Returns: - dict: Usually returns a dictionary with keys: - - - `cls_score` (Tensor): Classification scores. - - `bbox_pred` (Tensor): Box energies / deltas. - - `bbox_feats` (Tensor): Extract bbox RoI features. - - `loss_bbox` (dict): A dictionary of bbox loss components. - - `rois` (Tensor): RoIs with the shape (n, 5) where the first - column indicates batch id of each RoI. - - `bbox_targets` (tuple): Ground truth for proposals in a - single image. Containing the following list of Tensors: - (labels, label_weights, bbox_targets, bbox_weights) - """ - bbox_head = self.bbox_head[stage] - rois = bbox2roi([res.priors for res in sampling_results]) - bbox_results = self._bbox_forward(stage, x, rois) - bbox_results.update(rois=rois) - - bbox_loss_and_target = bbox_head.loss_and_target( - cls_score=bbox_results['cls_score'], - bbox_pred=bbox_results['bbox_pred'], - rois=rois, - sampling_results=sampling_results, - rcnn_train_cfg=self.train_cfg[stage]) - bbox_results.update(bbox_loss_and_target) - - return bbox_results - def _mask_forward(self, x: Tuple[Tensor], rois: Tensor) -> dict: """Mask head forward function used in both training and testing. 
@@ -247,68 +210,7 @@ def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList, Returns: dict[str, Tensor]: A dictionary of loss components """ - # TODO: May add a new function in baseroihead - assert len(rpn_results_list) == len(batch_data_samples) - outputs = unpack_gt_instances(batch_data_samples) - batch_gt_instances, batch_gt_instances_ignore, batch_img_metas \ - = outputs - - num_imgs = len(batch_data_samples) - losses = dict() - results_list = rpn_results_list - for stage in range(self.num_stages): - self.current_stage = stage - - stage_loss_weight = self.stage_loss_weights[stage] - - # assign gts and sample proposals - sampling_results = [] - if self.with_bbox or self.with_mask: - bbox_assigner = self.bbox_assigner[stage] - bbox_sampler = self.bbox_sampler[stage] - - for i in range(num_imgs): - results = results_list[i] - # rename rpn_results.bboxes to rpn_results.priors - results.priors = results.pop('bboxes') - - assign_result = bbox_assigner.assign( - results, batch_gt_instances[i], - batch_gt_instances_ignore[i]) - - sampling_result = bbox_sampler.sample( - assign_result, - results, - batch_gt_instances[i], - feats=[lvl_feat[i][None] for lvl_feat in x]) - sampling_results.append(sampling_result) - - # bbox head forward and loss - bbox_results = self.bbox_loss(stage, x, sampling_results) - - for name, value in bbox_results['loss_bbox'].items(): - losses[f's{stage}.{name}'] = ( - value * stage_loss_weight if 'loss' in name else value) - - # mask head forward and loss - # TODO: only last stage - if self.with_mask: - mask_results = self.mask_loss(stage, x, sampling_results, - batch_gt_instances) - for name, value in mask_results['loss_mask'].items(): - losses[f's{stage}.{name}'] = ( - value * stage_loss_weight if 'loss' in name else value) - - # refine bboxes - if stage < self.num_stages - 1: - bbox_head = self.bbox_head[stage] - with torch.no_grad(): - results_list = bbox_head.refine_bboxes( - sampling_results, bbox_results, batch_img_metas) - # Empty proposal - if results_list is None: - break - return losses + raise NotImplementedError def predict_mask(self, x: Tuple[Tensor], From f96700f230b49999716321cb7991947889b0b4e8 Mon Sep 17 00:00:00 2001 From: RangiLyu Date: Wed, 18 Jan 2023 19:51:42 +0800 Subject: [PATCH 05/12] rpn --- projects/Detic/detic/centernet_rpn_head.py | 160 +++++++++++++++------ projects/Detic/detic/detic_roi_head.py | 62 +++++++- 2 files changed, 177 insertions(+), 45 deletions(-) diff --git a/projects/Detic/detic/centernet_rpn_head.py b/projects/Detic/detic/centernet_rpn_head.py index d4936cfc88c..7980c5d37b2 100644 --- a/projects/Detic/detic/centernet_rpn_head.py +++ b/projects/Detic/detic/centernet_rpn_head.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from typing import Dict, List, Optional, Sequence, Tuple - +import copy import torch import torch.nn as nn from mmcv.cnn import Scale @@ -10,7 +10,7 @@ from torch import Tensor from mmdet.models.dense_heads import CenterNetUpdateHead -from mmdet.models.utils import multi_apply +from mmdet.models.utils import multi_apply, filter_scores_and_topk from mmdet.registry import MODELS from mmdet.structures.bbox import (bbox2distance, cat_boxes, get_box_tensor, get_box_wh, scale_boxes) @@ -128,7 +128,8 @@ def forward(self, x: Tuple[Tensor]) -> Tuple[List[Tensor], List[Tensor]]: - bbox_preds (list[Tensor]): Box energies / deltas for each \ scale level, each is a 4D-tensor, the channel number is 4. 
""" - return multi_apply(self.forward_single, x, self.scales, self.strides) + res = multi_apply(self.forward_single, x, self.scales, self.strides) + return res def forward_single(self, x: Tensor, scale: Scale, stride: int) -> Tuple[Tensor, Tensor]: @@ -159,27 +160,41 @@ def forward_single(self, x: Tensor, scale: Scale, bbox_pred *= stride return cls_score, bbox_pred # score aligned, box larger - def _bbox_post_process(self, - results: InstanceData, - cfg: ConfigDict, - rescale: bool = False, - with_nms: bool = True, - img_meta: Optional[dict] = None) -> InstanceData: - """bbox post-processing method. - - The boxes would be rescaled to the original image scale and do - the nms operation. Usually `with_nms` is False is used for aug test. + def _predict_by_feat_single(self, + cls_score_list: List[Tensor], + bbox_pred_list: List[Tensor], + score_factor_list: List[Tensor], + mlvl_priors: List[Tensor], + img_meta: dict, + cfg: ConfigDict, + rescale: bool = False, + with_nms: bool = True) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. Args: - results (:obj:`InstaceData`): Detection instance results, - each item has shape (num_bboxes, ). - cfg (ConfigDict): Test / postprocessing configuration, + cls_score_list (list[Tensor]): Box scores from all scale + levels of a single image, each item has shape + (num_priors * num_classes, H, W). + bbox_pred_list (list[Tensor]): Box energies / deltas from + all scale levels of a single image, each item has shape + (num_priors * 4, H, W). + score_factor_list (list[Tensor]): Score factor from all scale + levels of a single image, each item has shape + (num_priors * 1, H, W). + mlvl_priors (list[Tensor]): Each element in the list is + the priors of a single level in feature pyramid. In all + anchor-based methods, it has shape (num_priors, 4). In + all anchor-free methods, it has shape (num_priors, 2) + when `with_stride=True`, otherwise it still has shape + (num_priors, 4). + img_meta (dict): Image meta info. + cfg (mmengine.Config): Test / postprocessing configuration, if None, test_cfg would be used. rescale (bool): If True, return boxes in original image space. - Default to False. + Defaults to False. with_nms (bool): If True, do nms before return boxes. - Default to True. - img_meta (dict, optional): Image meta info. Defaults to None. + Defaults to True. Returns: :obj:`InstanceData`: Detection results of each image @@ -193,28 +208,85 @@ def _bbox_post_process(self, - bboxes (Tensor): Has a shape (num_instances, 4), the last dimension 4 arrange as (x1, y1, x2, y2). 
""" - # apply sqrt when `with_agn_hm=True` in Detic - results.scores = torch.sqrt(results.scores) # TODO: train - - if rescale: - assert img_meta.get('scale_factor') is not None - scale_factor = [1 / s for s in img_meta['scale_factor']] - results.bboxes = scale_boxes(results.bboxes, scale_factor) - - # filter small size bboxes - if cfg.get('min_bbox_size', -1) >= 0: - w, h = get_box_wh(results.bboxes) - valid_mask = (w > cfg.min_bbox_size) & (h > cfg.min_bbox_size) - if not valid_mask.all(): - results = results[valid_mask] - - if with_nms and results.bboxes.numel() > 0: - bboxes = get_box_tensor(results.bboxes) - det_bboxes, keep_idxs = batched_nms(bboxes, results.scores, - results.labels, cfg.nms) - results = results[keep_idxs] - # some nms would reweight the score, such as softnms - results.scores = det_bboxes[:, -1] - results = results[:cfg.max_per_img] - - return results + + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + img_shape = img_meta['img_shape'] + nms_pre = cfg.get('nms_pre', -1) + + mlvl_bbox_preds = [] + mlvl_valid_priors = [] + mlvl_scores = [] + mlvl_labels = [] + + for level_idx, (cls_score, bbox_pred, score_factor, priors) in \ + enumerate(zip(cls_score_list, bbox_pred_list, + score_factor_list, mlvl_priors)): + + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + + dim = self.bbox_coder.encode_size + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, dim) + cls_score = cls_score.permute(1, 2, + 0).reshape(-1, self.cls_out_channels) + heatmap = cls_score.sigmoid() + score_thr = cfg.get('score_thr', 0) + + candidate_inds = heatmap > score_thr # 0.05 + pre_nms_top_n = candidate_inds.sum() # N + pre_nms_top_n = pre_nms_top_n.clamp(max=nms_pre) # N + + per_box_cls = heatmap # HW x C + per_candidate_inds = candidate_inds # n + per_box_cls = per_box_cls[per_candidate_inds] # n + + per_candidate_nonzeros = per_candidate_inds.nonzero() # n + per_box_loc = per_candidate_nonzeros[:, 0] # n + per_class = per_candidate_nonzeros[:, 1] # n + + per_box_regression = bbox_pred # HW x 4 + per_box_regression = per_box_regression[per_box_loc] # n x 4 + per_grids = priors[per_box_loc] # n x 2 + + per_pre_nms_top_n = pre_nms_top_n + + if per_candidate_inds.sum().item() > per_pre_nms_top_n.item(): + per_box_cls, top_k_indices = \ + per_box_cls.topk(per_pre_nms_top_n, sorted=False) + per_class = per_class[top_k_indices] + per_box_regression = per_box_regression[top_k_indices] + per_grids = per_grids[top_k_indices] + + # TODO: replace with box coder + detections = torch.stack([ + per_grids[:, 0] - per_box_regression[:, 0], + per_grids[:, 1] - per_box_regression[:, 1], + per_grids[:, 0] + per_box_regression[:, 2], + per_grids[:, 1] + per_box_regression[:, 3], + ], dim=1) # n x 4 + + # avoid invalid boxes in RoI heads + detections[:, 2] = torch.max(detections[:, 2], detections[:, 0] + 0.01) + detections[:, 3] = torch.max(detections[:, 3], detections[:, 1] + 0.01) + + + mlvl_bbox_preds.append(detections) + mlvl_valid_priors.append(priors) + mlvl_scores.append(torch.sqrt(per_box_cls)) + mlvl_labels.append(per_class) + + bbox_pred = torch.cat(mlvl_bbox_preds) + # priors = cat_boxes(mlvl_valid_priors) + # bboxes = self.bbox_coder.decode(priors, bbox_pred, max_shape=img_shape) + + results = InstanceData() + results.bboxes = bbox_pred + results.scores = torch.cat(mlvl_scores) + results.labels = torch.cat(mlvl_labels) + + return self._bbox_post_process( + results=results, + cfg=cfg, + rescale=rescale, + with_nms=with_nms, + img_meta=img_meta) \ No newline at end of file 
diff --git a/projects/Detic/detic/detic_roi_head.py b/projects/Detic/detic/detic_roi_head.py index 73a390e3ed9..73273711b86 100644 --- a/projects/Detic/detic/detic_roi_head.py +++ b/projects/Detic/detic/detic_roi_head.py @@ -39,6 +39,66 @@ def init_mask_head(self, mask_roi_extractor: MultiConfig, self.share_roi_extractor = True self.mask_roi_extractor = self.bbox_roi_extractor + def _refine_roi(self, x: Tuple[Tensor], rois: Tensor, + batch_img_metas: List[dict], + num_proposals_per_img: Sequence[int], **kwargs) -> tuple: + """Multi-stage refinement of RoI. + + Args: + x (tuple[Tensor]): List of multi-level img features. + rois (Tensor): shape (n, 5), [batch_ind, x1, y1, x2, y2] + batch_img_metas (list[dict]): List of image information. + num_proposals_per_img (sequence[int]): number of proposals + in each image. + + Returns: + tuple: + + - rois (Tensor): Refined RoI. + - cls_scores (list[Tensor]): Average predicted + cls score per image. + - bbox_preds (list[Tensor]): Bbox branch predictions + for the last stage of per image. + """ + # "ms" in variable names means multi-stage + ms_scores = [] + for stage in range(self.num_stages): + bbox_results = self._bbox_forward( + stage=stage, x=x, rois=rois, **kwargs) + + # split batch bbox prediction back to each image + cls_scores = bbox_results['cls_score'].sigmoid() + bbox_preds = bbox_results['bbox_pred'] + + rois = rois.split(num_proposals_per_img, 0) + cls_scores = cls_scores.split(num_proposals_per_img, 0) + ms_scores.append(cls_scores) + bbox_preds = bbox_preds.split(num_proposals_per_img, 0) + + if stage < self.num_stages - 1: + bbox_head = self.bbox_head[stage] + refine_rois_list = [] + for i in range(len(batch_img_metas)): + if rois[i].shape[0] > 0: + bbox_label = cls_scores[i][:, :-1].argmax(dim=1) + # Refactor `bbox_head.regress_by_class` to only accept + # box tensor without img_idx concatenated. + refined_bboxes = bbox_head.regress_by_class( + rois[i][:, 1:], bbox_label, bbox_preds[i], + batch_img_metas[i]) + refined_bboxes = get_box_tensor(refined_bboxes) + refined_rois = torch.cat( + [rois[i][:, [0]], refined_bboxes], dim=1) + refine_rois_list.append(refined_rois) + rois = torch.cat(refine_rois_list) + # ms_scores aligned + # average scores of each image by stages + cls_scores = [ + sum([score[i] for score in ms_scores]) / float(len(ms_scores)) + for i in range(len(batch_img_metas)) + ] # aligned + return rois, cls_scores, bbox_preds + def _bbox_forward(self, stage: int, x: Tuple[Tensor], rois: Tensor) -> dict: """Box head forward function used in both training and testing. 
@@ -111,7 +171,7 @@ def predict_bbox(self, box_type=self.bbox_head[-1].predict_box_type, num_classes=self.bbox_head[-1].num_classes, score_per_cls=rcnn_test_cfg is None) - + # rois aligned rois, cls_scores, bbox_preds = self._refine_roi( x=x, rois=rois, From 7ebcda1171bd9639ec1d8d49b4836546ba394180 Mon Sep 17 00:00:00 2001 From: RangiLyu Date: Thu, 19 Jan 2023 13:14:34 +0800 Subject: [PATCH 06/12] roi align --- projects/Detic/configs/detic_centernet2_r50_fpn.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/projects/Detic/configs/detic_centernet2_r50_fpn.py b/projects/Detic/configs/detic_centernet2_r50_fpn.py index 602ca0f3635..1d86e4b1bef 100644 --- a/projects/Detic/configs/detic_centernet2_r50_fpn.py +++ b/projects/Detic/configs/detic_centernet2_r50_fpn.py @@ -76,9 +76,10 @@ stage_loss_weights=[1, 0.5, 0.25], bbox_roi_extractor=dict( type='SingleRoIExtractor', - roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0, use_torchvision=True), out_channels=256, - featmap_strides=[8, 16, 32]), + featmap_strides=[8, 16, 32], + finest_scale=112), # approximately equal to canonical_box_size=224, canonical_level=4 in D2 bbox_head=[ dict( type='Shared2FCBBoxHead', From a1d3a077c8255a6a8aeac600d8c7784ad6dd49bd Mon Sep 17 00:00:00 2001 From: RangiLyu Date: Thu, 19 Jan 2023 14:59:55 +0800 Subject: [PATCH 07/12] align inference --- .../Detic/configs/detic_centernet2_r50_fpn.py | 28 ++-- projects/Detic/detic/__init__.py | 5 + projects/Detic/detic/centernet_rpn_head.py | 142 +++--------------- projects/Detic/detic/detic_bbox_head.py | 112 ++++++++++++++ projects/Detic/detic/detic_roi_head.py | 29 ++-- projects/Detic/detic/zero_shot_classifier.py | 2 +- 6 files changed, 167 insertions(+), 151 deletions(-) create mode 100644 projects/Detic/detic/detic_bbox_head.py diff --git a/projects/Detic/configs/detic_centernet2_r50_fpn.py b/projects/Detic/configs/detic_centernet2_r50_fpn.py index 1d86e4b1bef..8e9dcf5d018 100644 --- a/projects/Detic/configs/detic_centernet2_r50_fpn.py +++ b/projects/Detic/configs/detic_centernet2_r50_fpn.py @@ -19,15 +19,10 @@ dict(type='Linear', in_features=1024, out_features=4) ] -num_classes = 1203 - 1 +num_classes = 1203 model = dict( type='CascadeRCNN', - init_cfg=dict( - type='Pretrained', - checkpoint= - '/home/rangilyu/projects/Detic/models/Detic_LbaseI_CLIP_R5021k_640b64_4x_ft4x_max-size_mmdet.pth' - ), data_preprocessor=dict( type='DetDataPreprocessor', mean=[123.675, 116.28, 103.53], @@ -76,13 +71,19 @@ stage_loss_weights=[1, 0.5, 0.25], bbox_roi_extractor=dict( type='SingleRoIExtractor', - roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0, use_torchvision=True), + roi_layer=dict( + type='RoIAlign', + output_size=7, + sampling_ratio=0, + use_torchvision=True), out_channels=256, featmap_strides=[8, 16, 32], - finest_scale=112), # approximately equal to canonical_box_size=224, canonical_level=4 in D2 + # approximately equal to + # canonical_box_size=224, canonical_level=4 in D2 + finest_scale=112), bbox_head=[ dict( - type='Shared2FCBBoxHead', + type='DeticBBoxHead', in_channels=256, fc_out_channels=1024, roi_feat_size=7, @@ -100,7 +101,7 @@ loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), dict( - type='Shared2FCBBoxHead', + type='DeticBBoxHead', in_channels=256, fc_out_channels=1024, roi_feat_size=7, @@ -118,7 +119,7 @@ loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), dict( - type='Shared2FCBBoxHead', + type='DeticBBoxHead', 
in_channels=256, fc_out_channels=1024, roi_feat_size=7, @@ -139,7 +140,10 @@ type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), out_channels=256, - featmap_strides=[8, 16, 32]), + featmap_strides=[8, 16, 32], + # approximately equal to + # canonical_box_size=224, canonical_level=4 in D2 + finest_scale=112), mask_head=dict( type='FCNMaskHead', num_convs=4, diff --git a/projects/Detic/detic/__init__.py b/projects/Detic/detic/__init__.py index 51d35020e82..d0ad070259a 100644 --- a/projects/Detic/detic/__init__.py +++ b/projects/Detic/detic/__init__.py @@ -1,4 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. from .centernet_rpn_head import CenterNetRPNHead +from .detic_bbox_head import DeticBBoxHead from .detic_roi_head import DeticRoIHead from .zero_shot_classifier import ZeroShotClassifier + +__all__ = [ + 'CenterNetRPNHead', 'DeticBBoxHead', 'DeticRoIHead', 'ZeroShotClassifier' +] diff --git a/projects/Detic/detic/centernet_rpn_head.py b/projects/Detic/detic/centernet_rpn_head.py index 7980c5d37b2..ad27317cb29 100644 --- a/projects/Detic/detic/centernet_rpn_head.py +++ b/projects/Detic/detic/centernet_rpn_head.py @@ -1,21 +1,17 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Dict, List, Optional, Sequence, Tuple import copy +from typing import List, Sequence, Tuple + import torch import torch.nn as nn from mmcv.cnn import Scale -from mmcv.ops import batched_nms from mmengine import ConfigDict from mmengine.structures import InstanceData from torch import Tensor from mmdet.models.dense_heads import CenterNetUpdateHead -from mmdet.models.utils import multi_apply, filter_scores_and_topk +from mmdet.models.utils import multi_apply from mmdet.registry import MODELS -from mmdet.structures.bbox import (bbox2distance, cat_boxes, get_box_tensor, - get_box_wh, scale_boxes) -from mmdet.utils import (ConfigType, InstanceList, OptConfigType, - OptInstanceList, reduce_mean) INF = 1000000000 RangeType = Sequence[Tuple[int, int]] @@ -24,84 +20,10 @@ @MODELS.register_module() class CenterNetRPNHead(CenterNetUpdateHead): """CenterNetUpdateHead is an improved version of CenterNet in CenterNet2. - Paper link ``_. - Args: - num_classes (int): Number of categories excluding the background - category. - in_channels (int): Number of channel in the input feature map. - regress_ranges (Sequence[Tuple[int, int]]): Regress range of multiple - level points. - hm_min_radius (int): Heatmap target minimum radius of cls branch. - Defaults to 4. - hm_min_overlap (float): Heatmap target minimum overlap of cls branch. - Defaults to 0.8. - more_pos_thresh (float): The filtering threshold when the cls branch - adds more positive samples. Defaults to 0.2. - more_pos_topk (int): The maximum number of additional positive samples - added to each gt. Defaults to 9. - soft_weight_on_reg (bool): Whether to use the soft target of the - cls branch as the soft weight of the bbox branch. - Defaults to False. - loss_cls (:obj:`ConfigDict` or dict): Config of cls loss. Defaults to - dict(type='GaussianFocalLoss', loss_weight=1.0) - loss_bbox (:obj:`ConfigDict` or dict): Config of bbox loss. Defaults to - dict(type='GIoULoss', loss_weight=2.0). - norm_cfg (:obj:`ConfigDict` or dict, optional): dictionary to construct - and config norm layer. Defaults to - ``norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)``. - train_cfg (:obj:`ConfigDict` or dict, optional): Training config. - Unused in CenterNet. Reserved for compatibility with - SingleStageDetector. 
- test_cfg (:obj:`ConfigDict` or dict, optional): Testing config - of CenterNet. + Paper link ``_. """ - def __init__(self, - num_classes: int, - in_channels: int, - regress_ranges: RangeType = ((0, 80), (64, 160), (128, 320), - (256, 640), (512, INF)), - hm_min_radius: int = 4, - hm_min_overlap: float = 0.8, - more_pos_thresh: float = 0.2, - more_pos_topk: int = 9, - soft_weight_on_reg: bool = False, - loss_cls: ConfigType = dict( - type='GaussianFocalLoss', - pos_weight=0.25, - neg_weight=0.75, - loss_weight=1.0), - loss_bbox: ConfigType = dict( - type='GIoULoss', loss_weight=2.0), - norm_cfg: OptConfigType = dict( - type='GN', num_groups=32, requires_grad=True), - train_cfg: OptConfigType = None, - test_cfg: OptConfigType = None, - **kwargs) -> None: - super().__init__( - num_classes=num_classes, - in_channels=in_channels, - loss_cls=loss_cls, - loss_bbox=loss_bbox, - norm_cfg=norm_cfg, - train_cfg=train_cfg, - test_cfg=test_cfg, - **kwargs) - self.soft_weight_on_reg = soft_weight_on_reg - self.hm_min_radius = hm_min_radius - self.more_pos_thresh = more_pos_thresh - self.more_pos_topk = more_pos_topk - self.delta = (1 - hm_min_overlap) / (1 + hm_min_overlap) - self.sigmoid_clamp = 0.0001 - - # GaussianFocalLoss must be sigmoid mode - self.use_sigmoid_cls = True - self.cls_out_channels = num_classes - - self.regress_ranges = regress_ranges - self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides]) - def _init_layers(self) -> None: """Initialize layers of the head.""" self._init_reg_convs() @@ -211,7 +133,6 @@ def _predict_by_feat_single(self, cfg = self.test_cfg if cfg is None else cfg cfg = copy.deepcopy(cfg) - img_shape = img_meta['img_shape'] nms_pre = cfg.get('nms_pre', -1) mlvl_bbox_preds = [] @@ -236,51 +157,34 @@ def _predict_by_feat_single(self, pre_nms_top_n = candidate_inds.sum() # N pre_nms_top_n = pre_nms_top_n.clamp(max=nms_pre) # N - per_box_cls = heatmap # HW x C - per_candidate_inds = candidate_inds # n - per_box_cls = per_box_cls[per_candidate_inds] # n + heatmap = heatmap[candidate_inds] # n - per_candidate_nonzeros = per_candidate_inds.nonzero() # n - per_box_loc = per_candidate_nonzeros[:, 0] # n - per_class = per_candidate_nonzeros[:, 1] # n + candidate_nonzeros = candidate_inds.nonzero() # n + box_loc = candidate_nonzeros[:, 0] # n + labels = candidate_nonzeros[:, 1] # n - per_box_regression = bbox_pred # HW x 4 - per_box_regression = per_box_regression[per_box_loc] # n x 4 - per_grids = priors[per_box_loc] # n x 2 + bbox_pred = bbox_pred[box_loc] # n x 4 + per_grids = priors[box_loc] # n x 2 - per_pre_nms_top_n = pre_nms_top_n - - if per_candidate_inds.sum().item() > per_pre_nms_top_n.item(): - per_box_cls, top_k_indices = \ - per_box_cls.topk(per_pre_nms_top_n, sorted=False) - per_class = per_class[top_k_indices] - per_box_regression = per_box_regression[top_k_indices] + if candidate_inds.sum().item() > pre_nms_top_n.item(): + heatmap, top_k_indices = \ + heatmap.topk(pre_nms_top_n, sorted=False) + labels = labels[top_k_indices] + bbox_pred = bbox_pred[top_k_indices] per_grids = per_grids[top_k_indices] - # TODO: replace with box coder - detections = torch.stack([ - per_grids[:, 0] - per_box_regression[:, 0], - per_grids[:, 1] - per_box_regression[:, 1], - per_grids[:, 0] + per_box_regression[:, 2], - per_grids[:, 1] + per_box_regression[:, 3], - ], dim=1) # n x 4 - + bboxes = self.bbox_coder.decode(per_grids, bbox_pred) # avoid invalid boxes in RoI heads - detections[:, 2] = torch.max(detections[:, 2], detections[:, 0] + 0.01) - detections[:, 3] = 
torch.max(detections[:, 3], detections[:, 1] + 0.01) + bboxes[:, 2] = torch.max(bboxes[:, 2], bboxes[:, 0] + 0.01) + bboxes[:, 3] = torch.max(bboxes[:, 3], bboxes[:, 1] + 0.01) - - mlvl_bbox_preds.append(detections) + mlvl_bbox_preds.append(bboxes) mlvl_valid_priors.append(priors) - mlvl_scores.append(torch.sqrt(per_box_cls)) - mlvl_labels.append(per_class) - - bbox_pred = torch.cat(mlvl_bbox_preds) - # priors = cat_boxes(mlvl_valid_priors) - # bboxes = self.bbox_coder.decode(priors, bbox_pred, max_shape=img_shape) + mlvl_scores.append(torch.sqrt(heatmap)) + mlvl_labels.append(labels) results = InstanceData() - results.bboxes = bbox_pred + results.bboxes = torch.cat(mlvl_bbox_preds) results.scores = torch.cat(mlvl_scores) results.labels = torch.cat(mlvl_labels) @@ -289,4 +193,4 @@ def _predict_by_feat_single(self, cfg=cfg, rescale=rescale, with_nms=with_nms, - img_meta=img_meta) \ No newline at end of file + img_meta=img_meta) diff --git a/projects/Detic/detic/detic_bbox_head.py b/projects/Detic/detic/detic_bbox_head.py new file mode 100644 index 00000000000..a88aac702d4 --- /dev/null +++ b/projects/Detic/detic/detic_bbox_head.py @@ -0,0 +1,112 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Union + +from mmengine.config import ConfigDict +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.models.layers import multiclass_nms +from mmdet.models.roi_heads.bbox_heads import Shared2FCBBoxHead +from mmdet.models.utils import empty_instances +from mmdet.registry import MODELS +from mmdet.structures.bbox import get_box_tensor, scale_boxes + + +@MODELS.register_module() +class DeticBBoxHead(Shared2FCBBoxHead): + + def __init__(self, + *args, + init_cfg: Optional[Union[dict, ConfigDict]] = None, + **kwargs) -> None: + super().__init__(*args, init_cfg=init_cfg, **kwargs) + # reconstruct fc_cls and fc_reg since input channels are changed + assert self.with_cls + cls_channels = self.num_classes + cls_predictor_cfg_ = self.cls_predictor_cfg.copy() + cls_predictor_cfg_.update( + in_features=self.cls_last_dim, out_features=cls_channels) + self.fc_cls = MODELS.build(cls_predictor_cfg_) + + def _predict_by_feat_single( + self, + roi: Tensor, + cls_score: Tensor, + bbox_pred: Tensor, + img_meta: dict, + rescale: bool = False, + rcnn_test_cfg: Optional[ConfigDict] = None) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. + + Args: + roi (Tensor): Boxes to be transformed. Has shape (num_boxes, 5). + last dimension 5 arrange as (batch_index, x1, y1, x2, y2). + cls_score (Tensor): Box scores, has shape + (num_boxes, num_classes + 1). + bbox_pred (Tensor): Box energies / deltas. + has shape (num_boxes, num_classes * 4). + img_meta (dict): image information. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of Bbox Head. + Defaults to None + + Returns: + :obj:`InstanceData`: Detection results of each image\ + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). 
+ """ + results = InstanceData() + if roi.shape[0] == 0: + return empty_instances([img_meta], + roi.device, + task_type='bbox', + instance_results=[results], + box_type=self.predict_box_type, + use_box_type=False, + num_classes=self.num_classes, + score_per_cls=rcnn_test_cfg is None)[0] + scores = cls_score + img_shape = img_meta['img_shape'] + num_rois = roi.size(0) + + num_classes = 1 if self.reg_class_agnostic else self.num_classes + roi = roi.repeat_interleave(num_classes, dim=0) + bbox_pred = bbox_pred.view(-1, self.bbox_coder.encode_size) + bboxes = self.bbox_coder.decode( + roi[..., 1:], bbox_pred, max_shape=img_shape) + + if rescale and bboxes.size(0) > 0: + assert img_meta.get('scale_factor') is not None + scale_factor = [1 / s for s in img_meta['scale_factor']] + bboxes = scale_boxes(bboxes, scale_factor) + + # Get the inside tensor when `bboxes` is a box type + bboxes = get_box_tensor(bboxes) + box_dim = bboxes.size(-1) + bboxes = bboxes.view(num_rois, -1) + + if rcnn_test_cfg is None: + # This means that it is aug test. + # It needs to return the raw results without nms. + results.bboxes = bboxes + results.scores = scores + else: + det_bboxes, det_labels = multiclass_nms( + bboxes, + scores, + rcnn_test_cfg.score_thr, + rcnn_test_cfg.nms, + rcnn_test_cfg.max_per_img, + box_dim=box_dim) + results.bboxes = det_bboxes[:, :-1] + results.scores = det_bboxes[:, -1] + results.labels = det_labels + return results diff --git a/projects/Detic/detic/detic_roi_head.py b/projects/Detic/detic/detic_roi_head.py index 73273711b86..7b870db7f8d 100644 --- a/projects/Detic/detic/detic_roi_head.py +++ b/projects/Detic/detic/detic_roi_head.py @@ -1,21 +1,18 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import List, Sequence, Tuple, Union +from typing import List, Sequence, Tuple import torch -import torch.nn as nn -from mmengine.model import ModuleList from mmengine.structures import InstanceData from torch import Tensor from mmdet.models.roi_heads import CascadeRoIHead from mmdet.models.task_modules.samplers import SamplingResult from mmdet.models.test_time_augs import merge_aug_masks -from mmdet.models.utils.misc import empty_instances, unpack_gt_instances -from mmdet.registry import MODELS, TASK_UTILS +from mmdet.models.utils.misc import empty_instances +from mmdet.registry import MODELS from mmdet.structures import SampleList from mmdet.structures.bbox import bbox2roi, get_box_tensor -from mmdet.utils import (ConfigType, InstanceList, MultiConfig, OptConfigType, - OptMultiConfig) +from mmdet.utils import ConfigType, InstanceList, MultiConfig @MODELS.register_module() @@ -179,19 +176,13 @@ def predict_bbox(self, num_proposals_per_img=num_proposals_per_img, **kwargs) - # TODO - self.mult_proposal_score = True - self.one_class_per_proposal = True - - # centernet2 - if self.mult_proposal_score: # True - cls_scores = [(s * ps[:, None]) ** 0.5 \ + # score reweighting in centernet2 + cls_scores = [(s * ps[:, None])**0.5 for s, ps in zip(cls_scores, proposal_scores)] - if self.one_class_per_proposal: # True - cls_scores = [ - s * (s == s[:, :-1].max(dim=1)[0][:, None]).float() - for s in cls_scores - ] + cls_scores = [ + s * (s == s[:, :-1].max(dim=1)[0][:, None]).float() + for s in cls_scores + ] # fast_rcnn_inference results_list = self.bbox_head[-1].predict_by_feat( diff --git a/projects/Detic/detic/zero_shot_classifier.py b/projects/Detic/detic/zero_shot_classifier.py index 8fbb886b0f5..cb9946d5825 100644 --- a/projects/Detic/detic/zero_shot_classifier.py +++ 
b/projects/Detic/detic/zero_shot_classifier.py @@ -1,4 +1,4 @@ -# Copyright (c) OpenMMLab. All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. import numpy as np import torch from torch import nn From 7c24d64512288b59dd611a19e0c72b6ce2a02624 Mon Sep 17 00:00:00 2001 From: RangiLyu Date: Thu, 19 Jan 2023 17:35:12 +0800 Subject: [PATCH 08/12] support zero shot inference --- .../Detic/configs/detic_centernet2_r50_fpn.py | 2 +- projects/Detic/demo.py | 144 ++++++++++++++++++ projects/Detic/detic/centernet_rpn_head.py | 2 +- projects/Detic/detic/detic_bbox_head.py | 2 +- projects/Detic/detic/detic_roi_head.py | 2 +- projects/Detic/detic/text_encoder.py | 50 ++++++ projects/Detic/detic/utils.py | 75 +++++++++ projects/Detic/detic/zero_shot_classifier.py | 2 +- 8 files changed, 274 insertions(+), 5 deletions(-) create mode 100644 projects/Detic/demo.py create mode 100644 projects/Detic/detic/text_encoder.py create mode 100644 projects/Detic/detic/utils.py diff --git a/projects/Detic/configs/detic_centernet2_r50_fpn.py b/projects/Detic/configs/detic_centernet2_r50_fpn.py index 8e9dcf5d018..621151eb057 100644 --- a/projects/Detic/configs/detic_centernet2_r50_fpn.py +++ b/projects/Detic/configs/detic_centernet2_r50_fpn.py @@ -238,7 +238,7 @@ nms=dict(type='nms', iou_threshold=0.9), min_bbox_size=0), rcnn=dict( - score_thr=0.5, + score_thr=0.02, nms=dict(type='nms', iou_threshold=0.5), max_per_img=300, mask_thr_binary=0.5))) diff --git a/projects/Detic/demo.py b/projects/Detic/demo.py new file mode 100644 index 00000000000..234af362e2f --- /dev/null +++ b/projects/Detic/demo.py @@ -0,0 +1,144 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +from argparse import ArgumentParser + +import mmcv +from mmdet.apis import inference_detector, init_detector +from mmengine.logging import print_log +from mmengine.utils import ProgressBar +import urllib +import torch +from mmengine.utils import scandir + +from mmdet.registry import VISUALIZERS +from mmdet.utils import register_all_modules + +IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', + '.tiff', '.webp') + + +def get_file_list(source_root: str) -> [list, dict]: + """Get file list. + + Args: + source_root (str): image or video source path + + Return: + source_file_path_list (list): A list for all source file. + source_type (dict): Source type: file or url or dir. 
+ """ + is_dir = os.path.isdir(source_root) + is_url = source_root.startswith(('http:/', 'https:/')) + is_file = os.path.splitext(source_root)[-1].lower() in IMG_EXTENSIONS + + source_file_path_list = [] + if is_dir: + # when input source is dir + for file in scandir(source_root, IMG_EXTENSIONS, recursive=True): + source_file_path_list.append(os.path.join(source_root, file)) + elif is_url: + # when input source is url + filename = os.path.basename( + urllib.parse.unquote(source_root).split('?')[0]) + file_save_path = os.path.join(os.getcwd(), filename) + print(f'Downloading source file to {file_save_path}') + torch.hub.download_url_to_file(source_root, file_save_path) + source_file_path_list = [file_save_path] + elif is_file: + # when input source is single image + source_file_path_list = [source_root] + else: + print('Cannot find image file.') + + source_type = dict(is_dir=is_dir, is_url=is_url, is_file=is_file) + + return source_file_path_list, source_type + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument( + 'img', help='Image path, include image file, dir and URL.') + parser.add_argument('config', help='Config file') + parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument( + '--out-dir', default='./output', help='Path to output file') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--show', action='store_true', help='Show the detection results') + parser.add_argument( + '--score-thr', type=float, default=0.3, help='Bbox score threshold') + parser.add_argument( + '--dataset', type=str, help='dataset name to load the text embedding') + parser.add_argument( + '--class-name', + nargs='+', + type=str, + help='Only Save those classes if set') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + # register all modules in mmdet into the registries + register_all_modules() + + # build the model from a config file and a checkpoint file + model = init_detector(args.config, args.checkpoint, device=args.device) + + if not os.path.exists(args.out_dir) and not args.show: + os.mkdir(args.out_dir) + + # init visualizer + visualizer = VISUALIZERS.build(model.cfg.visualizer) + visualizer.dataset_meta = model.dataset_meta + + # get file list + files, source_type = get_file_list(args.img) + from detic.utils import get_text_embeddings, reset_cls_layer_weight, get_class_names + + # class name embeddings + if args.class_name: + dataset_classes = args.class_name + elif args.dataset: + dataset_classes = get_class_names(args.dataset) + embedding = get_text_embeddings(dataset=args.dataset, custom_vocabulary=args.class_name) + visualizer.dataset_meta['classes'] = dataset_classes + reset_cls_layer_weight(model, embedding) + + # start detector inference + progress_bar = ProgressBar(len(files)) + for file in files: + result = inference_detector(model, file) + + img = mmcv.imread(file) + img = mmcv.imconvert(img, 'bgr', 'rgb') + + if source_type['is_dir']: + filename = os.path.relpath(file, args.img).replace('/', '_') + else: + filename = os.path.basename(file) + out_file = None if args.show else os.path.join(args.out_dir, filename) + + progress_bar.update() + + visualizer.add_datasample( + filename, + img, + data_sample=result, + draw_gt=False, + show=args.show, + wait_time=0, + out_file=out_file, + pred_score_thr=args.score_thr) + + if not args.show: + print_log( + f'\nResults have been saved at {os.path.abspath(args.out_dir)}') + + +if __name__ == '__main__': + main() 
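
The `demo.py` added above wires together file discovery, detector construction, text-embedding generation and per-image inference. A condensed sketch of the same flow for interactive use; the config, checkpoint and image paths are placeholders, and it assumes running from `projects/Detic` with CLIP installed:

```python
from mmdet.apis import inference_detector, init_detector
from mmdet.utils import register_all_modules

from detic.utils import get_text_embeddings, reset_cls_layer_weight

register_all_modules()
model = init_detector(
    'configs/detic_centernet2_r50_fpn.py',  # placeholder config path
    'detic_checkpoint.pth',                 # placeholder checkpoint path
    device='cuda:0')

# build CLIP text embeddings for a custom vocabulary and swap them into
# the zero-shot classification layer
embedding = get_text_embeddings(custom_vocabulary=['headphone', 'webcam'])
reset_cls_layer_weight(model, embedding)

result = inference_detector(model, 'demo.jpg')  # placeholder image path
```
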
diff --git a/projects/Detic/detic/centernet_rpn_head.py b/projects/Detic/detic/centernet_rpn_head.py index ad27317cb29..765d6dfb2b6 100644 --- a/projects/Detic/detic/centernet_rpn_head.py +++ b/projects/Detic/detic/centernet_rpn_head.py @@ -17,7 +17,7 @@ RangeType = Sequence[Tuple[int, int]] -@MODELS.register_module() +@MODELS.register_module(force=True) # avoid bug class CenterNetRPNHead(CenterNetUpdateHead): """CenterNetUpdateHead is an improved version of CenterNet in CenterNet2. diff --git a/projects/Detic/detic/detic_bbox_head.py b/projects/Detic/detic/detic_bbox_head.py index a88aac702d4..9408cbe04fd 100644 --- a/projects/Detic/detic/detic_bbox_head.py +++ b/projects/Detic/detic/detic_bbox_head.py @@ -12,7 +12,7 @@ from mmdet.structures.bbox import get_box_tensor, scale_boxes -@MODELS.register_module() +@MODELS.register_module(force=True) # avoid bug class DeticBBoxHead(Shared2FCBBoxHead): def __init__(self, diff --git a/projects/Detic/detic/detic_roi_head.py b/projects/Detic/detic/detic_roi_head.py index 7b870db7f8d..a09c11c6e69 100644 --- a/projects/Detic/detic/detic_roi_head.py +++ b/projects/Detic/detic/detic_roi_head.py @@ -15,7 +15,7 @@ from mmdet.utils import ConfigType, InstanceList, MultiConfig -@MODELS.register_module() +@MODELS.register_module(force=True) # avoid bug class DeticRoIHead(CascadeRoIHead): def init_mask_head(self, mask_roi_extractor: MultiConfig, diff --git a/projects/Detic/detic/text_encoder.py b/projects/Detic/detic/text_encoder.py new file mode 100644 index 00000000000..f0024efaf30 --- /dev/null +++ b/projects/Detic/detic/text_encoder.py @@ -0,0 +1,50 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Union + +import torch +import torch.nn as nn + + +class CLIPTextEncoder(nn.Module): + + def __init__(self, model_name='ViT-B/32'): + super().__init__() + import clip + from clip.simple_tokenizer import SimpleTokenizer + self.tokenizer = SimpleTokenizer() + pretrained_model, _ = clip.load(model_name, device='cpu') + self.clip = pretrained_model + + @property + def device(self): + return self.clip.device + + @property + def dtype(self): + return self.clip.dtype + + def tokenize(self, + texts: Union[str, List[str]], + context_length: int = 77) -> torch.LongTensor: + if isinstance(texts, str): + texts = [texts] + + sot_token = self.tokenizer.encoder['<|startoftext|>'] + eot_token = self.tokenizer.encoder['<|endoftext|>'] + all_tokens = [[sot_token] + self.tokenizer.encode(text) + [eot_token] + for text in texts] + result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) + + for i, tokens in enumerate(all_tokens): + if len(tokens) > context_length: + st = torch.randint(len(tokens) - context_length + 1, + (1, ))[0].item() + tokens = tokens[st:st + context_length] + result[i, :len(tokens)] = torch.tensor(tokens) + + return result + + def forward(self, text): + text = self.tokenize(text) + text_features = self.clip.encode_text(text) + return text_features diff --git a/projects/Detic/detic/utils.py b/projects/Detic/detic/utils.py new file mode 100644 index 00000000000..a7d5f026caa --- /dev/null +++ b/projects/Detic/detic/utils.py @@ -0,0 +1,75 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch +import torch.nn.functional as F + +from .text_encoder import CLIPTextEncoder + + +def get_text_embeddings(dataset=None, + custom_vocabulary=None, + prompt_prefix='a '): + assert (dataset is None) ^ (custom_vocabulary is None), \ + 'Either `dataset` or `custom_vocabulary` should be specified.' 
+ if dataset: + assert dataset in DATASET_EMBEDDINGS + return DATASET_EMBEDDINGS[dataset] + elif custom_vocabulary: + text_encoder = CLIPTextEncoder() + text_encoder.eval() + texts = [prompt_prefix + x for x in custom_vocabulary] + embeddings = text_encoder(texts).detach().permute( + 1, 0).contiguous().cpu() + return embeddings + else: + raise NotImplementedError + + +DATASET_EMBEDDINGS = { + 'lvis': 'datasets/metadata/lvis_v1_clip_a+cname.npy', + 'objects365': 'datasets/metadata/o365_clip_a+cnamefix.npy', + 'openimages': 'datasets/metadata/oid_clip_a+cname.npy', + 'coco': 'datasets/metadata/coco_clip_a+cname.npy', +} + + +def get_class_names(dataset): + if dataset == 'coco': + from mmdet.datasets import CocoDataset + class_names = CocoDataset.METAINFO['classes'] + elif dataset == 'cityscapes': + from mmdet.datasets import CityscapesDataset + class_names = CityscapesDataset.METAINFO['classes'] + elif dataset == 'voc': + from mmdet.datasets import VOCDataset + class_names = VOCDataset.METAINFO['classes'] + elif dataset == 'openimages': + from mmdet.datasets import OpenImagesDataset + class_names = OpenImagesDataset.METAINFO['classes'] + elif dataset == 'lvis': + from mmdet.datasets import LVISV1Dataset + class_names = LVISV1Dataset.METAINFO['classes'] + else: + raise TypeError(f'Invalid type for dataset name: {type(dataset)}') + return class_names + + +def reset_cls_layer_weight(model, weight): + if type(weight) == str: + print('Resetting cls_layer_weight from file: ', weight) + zs_weight = torch.tensor( + np.load(weight), + dtype=torch.float32).permute(1, 0).contiguous() # D x C + else: + zs_weight = weight + zs_weight = torch.cat( + [zs_weight, zs_weight.new_zeros( + (zs_weight.shape[0], 1))], dim=1) # D x (C + 1) + zs_weight = F.normalize(zs_weight, p=2, dim=0) + zs_weight = zs_weight.to('cuda') + num_classes = zs_weight.shape[-1] + + for bbox_head in model.roi_head.bbox_head: + bbox_head.num_classes = num_classes + del bbox_head.fc_cls.zs_weight + bbox_head.fc_cls.zs_weight = zs_weight diff --git a/projects/Detic/detic/zero_shot_classifier.py b/projects/Detic/detic/zero_shot_classifier.py index cb9946d5825..35c9e49285c 100644 --- a/projects/Detic/detic/zero_shot_classifier.py +++ b/projects/Detic/detic/zero_shot_classifier.py @@ -7,7 +7,7 @@ from mmdet.registry import MODELS -@MODELS.register_module() +@MODELS.register_module(force=True) # avoid bug class ZeroShotClassifier(nn.Module): def __init__( From 8929acb8a37048e69a9e4401cb6c903691a140d5 Mon Sep 17 00:00:00 2001 From: RangiLyu Date: Thu, 19 Jan 2023 17:35:42 +0800 Subject: [PATCH 09/12] lint --- projects/Detic/demo.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/projects/Detic/demo.py b/projects/Detic/demo.py index 234af362e2f..01b87d78e51 100644 --- a/projects/Detic/demo.py +++ b/projects/Detic/demo.py @@ -1,15 +1,14 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import os +import urllib from argparse import ArgumentParser import mmcv -from mmdet.apis import inference_detector, init_detector -from mmengine.logging import print_log -from mmengine.utils import ProgressBar -import urllib import torch -from mmengine.utils import scandir +from mmengine.logging import print_log +from mmengine.utils import ProgressBar, scandir +from mmdet.apis import inference_detector, init_detector from mmdet.registry import VISUALIZERS from mmdet.utils import register_all_modules @@ -98,14 +97,16 @@ def main(): # get file list files, source_type = get_file_list(args.img) - from detic.utils import get_text_embeddings, reset_cls_layer_weight, get_class_names + from detic.utils import (get_class_names, get_text_embeddings, + reset_cls_layer_weight) # class name embeddings if args.class_name: dataset_classes = args.class_name elif args.dataset: dataset_classes = get_class_names(args.dataset) - embedding = get_text_embeddings(dataset=args.dataset, custom_vocabulary=args.class_name) + embedding = get_text_embeddings( + dataset=args.dataset, custom_vocabulary=args.class_name) visualizer.dataset_meta['classes'] = dataset_classes reset_cls_layer_weight(model, embedding) From 0beaaca60a4f7b22646c7fe02691896e49addff3 Mon Sep 17 00:00:00 2001 From: RangiLyu Date: Thu, 19 Jan 2023 18:31:50 +0800 Subject: [PATCH 10/12] add readme --- projects/Detic/README.md | 150 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 projects/Detic/README.md diff --git a/projects/Detic/README.md b/projects/Detic/README.md new file mode 100644 index 00000000000..b21181fd9cf --- /dev/null +++ b/projects/Detic/README.md @@ -0,0 +1,150 @@ +# Detecting Twenty-thousand Classes using Image-level Supervision + +## Description + +**Detic**: A **Det**ector with **i**mage **c**lasses that can use image-level labels to easily train detectors. + +
+
+> [**Detecting Twenty-thousand Classes using Image-level Supervision**](http://arxiv.org/abs/2201.02605),
+> Xingyi Zhou, Rohit Girdhar, Armand Joulin, Philipp Krähenbühl, Ishan Misra,
+> *ECCV 2022 ([arXiv 2201.02605](http://arxiv.org/abs/2201.02605))*
+
+## Usage
+
+
+
+## Installation
+
+Detic requires CLIP to be installed.
+
+```shell
+pip install git+https://github.com/openai/CLIP.git
+```
+
+### Demo
+
+#### Inference with existing dataset vocabulary embeddings
+
+First, go to the Detic project folder.
+
+```shell
+cd projects/Detic
+```
+
+Then, download the [dataset metainfo](https://github.com/facebookresearch/Detic/tree/main/datasets/metadata) to the `datasets/metadata` folder. For example, you can download LVIS metainfo with the following command:
+
+```shell
+wget -P datasets/metadata https://raw.githubusercontent.com/facebookresearch/Detic/main/datasets/metadata/lvis_v1_clip_a%2Bcname.npy
+```
+
+You can run the demo like this:
+
+```shell
+python demo.py \
+    ${IMAGE_PATH} \
+    ${CONFIG_PATH} \
+    ${MODEL_PATH} \
+    --show \
+    --score-thr 0.5 \
+    --dataset lvis
+```
+
+![image](https://user-images.githubusercontent.com/12907710/213418957-f105f9a2-1e28-4c98-9c43-ea7cdf27b19c.png)
+
+### Inference with custom vocabularies
+
+- Detic can detect any class given class names by using CLIP.
+
+You can detect custom classes with the `--class-name` option:
+
+```
+python demo.py \
+    ${IMAGE_PATH} \
+    ${CONFIG_PATH} \
+    ${MODEL_PATH} \
+    --show \
+    --score-thr 0.3 \
+    --class-name headphone webcam paper coffe
+```
+
+![image](https://user-images.githubusercontent.com/12907710/213418548-64deab8c-3fe4-4988-8d1f-e6320dc0af6d.png)
+
+## Results
+
+Here we provide the baseline version of SparseInst with ResNet50 backbone.
+
+To find more variants, please visit the [official model zoo](https://github.com/facebookresearch/Detic/blob/main/docs/MODEL_ZOO.md).
+
+| Backbone | Training data | Lr schd | Mem (GB) | FPS | mask AP | Config | Download |
+| :------: | :-----------------: | :-----: | :------: | :-: | :-----: | :---------------------------------------------: | :--------------------: |
+| R-50 | ImageNet-21K & LVIS | - | - | - | 32.4 | [config](./configs/detic_centernet2_r50_fpn.py) | [model](-) \| [log](-) |
+
+## Citation
+
+If you find Detic useful in your research or applications, please consider giving a star 🌟 to the [official repository](https://github.com/facebookresearch/Detic) and citing Detic by the following BibTeX entry.
+
+```BibTeX
+@inproceedings{zhou2022detecting,
+  title={Detecting Twenty-thousand Classes using Image-level Supervision},
+  author={Zhou, Xingyi and Girdhar, Rohit and Joulin, Armand and Kr{\"a}henb{\"u}hl, Philipp and Misra, Ishan},
+  booktitle={ECCV},
+  year={2022}
+}
+
+```
+
+## Checklist
+
+
+
+- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`.
+
+  - [x] Finish the code
+
+
+
+  - [x] Basic docstrings & proper citation
+
+
+
+  - [x] Test-time correctness
+
+
+
+  - [x] A full README
+
+
+
+- [ ] Milestone 2: Indicates a successful model implementation.
+
+  - [ ] Training-time correctness
+
+
+
+- [ ] Milestone 3: Good to be a part of our core package!
+
+  - [ ] Type hints and docstrings
+
+
+
+  - [ ] Unit tests
+
+
+
+  - [ ] Code polishing
+
+
+
+  - [ ] Metafile.yml
+
+
+
+- [ ] Move your modules into the core package following the codebase's file hierarchy structure.
+
+
+
+- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure.
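
The custom-vocabulary demo in the README above works because the classification layer is not a fixed `num_classes`-way head: `ZeroShotClassifier` scores each RoI feature against CLIP text embeddings of the class names. A rough sketch of the idea on random tensors; the feature dimension and the temperature value are illustrative assumptions, not the exact Detic values:

```python
import torch
import torch.nn.functional as F

roi_feats = torch.randn(100, 512)   # per-RoI features projected to the CLIP dim
text_embed = torch.randn(512, 3)    # D x C, one column per class-name embedding

roi_feats = F.normalize(roi_feats, p=2, dim=1)
text_embed = F.normalize(text_embed, p=2, dim=0)

# temperature-scaled cosine similarity; the Detic RoI head applies sigmoid
# rather than softmax to these classification logits
logits = 50.0 * roi_feats @ text_embed
scores = logits.sigmoid()
```
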
From bdc74478fff57e5bbd0750e4815f74307c617934 Mon Sep 17 00:00:00 2001 From: RangiLyu Date: Fri, 20 Jan 2023 14:01:51 +0800 Subject: [PATCH 11/12] add swin demo --- projects/Detic/README.md | 18 ++++---- ...nternet2_swin-b_fpn_4x_lvis-coco-in21k.py} | 25 ++++++----- projects/Detic/detic/utils.py | 41 ++++++++++--------- 3 files changed, 48 insertions(+), 36 deletions(-) rename projects/Detic/configs/{detic_centernet2_r50_fpn.py => detic_centernet2_swin-b_fpn_4x_lvis-coco-in21k.py} (96%) diff --git a/projects/Detic/README.md b/projects/Detic/README.md index b21181fd9cf..4e99779342d 100644 --- a/projects/Detic/README.md +++ b/projects/Detic/README.md @@ -32,7 +32,9 @@ First, go to the Detic project folder. cd projects/Detic ``` -Then, download the [dataset metainfo](https://github.com/facebookresearch/Detic/tree/main/datasets/metadata) to the `datasets/metadata` folder. For example, you can download LVIS metainfo with the following command: +Then, download the pre-computed CLIP embeddings from [dataset metainfo](https://github.com/facebookresearch/Detic/tree/main/datasets/metadata) to the `datasets/metadata` folder. +The CLIP embeddings will be loaded to the zero-shot classifier during inference. +For example, you can download LVIS's class name embeddings with the following command: ```shell wget -P datasets/metadata https://raw.githubusercontent.com/facebookresearch/Detic/main/datasets/metadata/lvis_v1_clip_a%2Bcname.npy @@ -50,7 +52,7 @@ python demo.py \ --dataset lvis ``` -![image](https://user-images.githubusercontent.com/12907710/213418957-f105f9a2-1e28-4c98-9c43-ea7cdf27b19c.png) +![image](https://user-images.githubusercontent.com/12907710/213624759-f0a2ba0c-0f5c-4424-a350-5ba5349e5842.png) ### Inference with custom vocabularies @@ -68,17 +70,19 @@ python demo.py \ --class-name headphone webcam paper coffe ``` -![image](https://user-images.githubusercontent.com/12907710/213418548-64deab8c-3fe4-4988-8d1f-e6320dc0af6d.png) +![image](https://user-images.githubusercontent.com/12907710/213624637-e9e8a313-9821-4782-a18a-4408c876852b.png) + +Note that `headphone`, `paper` and `coffe` (typo intended) are not LVIS classes. Despite the misspelled class name, Detic can produce a reasonable detection for `coffe`. ## Results -Here we provide the baseline version of SparseInst with ResNet50 backbone. +Here we only provide the Detic Swin-B model for the open vocabulary demo. Multi-dataset training and open-vocabulary testing will be supported in the future. To find more variants, please visit the [official model zoo](https://github.com/facebookresearch/Detic/blob/main/docs/MODEL_ZOO.md). 
-| Backbone | Training data | Lr schd | Mem (GB) | FPS | mask AP | Config | Download | -| :------: | :-----------------: | :-----: | :------: | :-: | :-----: | :---------------------------------------------: | :--------------------: | -| R-50 | ImageNet-21K & LVIS | - | - | - | 32.4 | [config](./configs/detic_centernet2_r50_fpn.py) | [model](-) \| [log](-) | +| Backbone | Training data | Config | Download | +| :------: | :------------------------: | :-------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Swin-B | ImageNet-21K & LVIS & COCO | [config](./configs/detic_centernet2_swin-b_fpn_4x_lvis-coco-in21k.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/detic/detic_centernet2_swin-b_fpn_4x_lvis-coco-in21k/detic_centernet2_swin-b_fpn_4x_lvis-coco-in21k_20230120-0d301978.pth) | ## Citation diff --git a/projects/Detic/configs/detic_centernet2_r50_fpn.py b/projects/Detic/configs/detic_centernet2_swin-b_fpn_4x_lvis-coco-in21k.py similarity index 96% rename from projects/Detic/configs/detic_centernet2_r50_fpn.py rename to projects/Detic/configs/detic_centernet2_swin-b_fpn_4x_lvis-coco-in21k.py index 621151eb057..19a17aea7bc 100644 --- a/projects/Detic/configs/detic_centernet2_r50_fpn.py +++ b/projects/Detic/configs/detic_centernet2_swin-b_fpn_4x_lvis-coco-in21k.py @@ -19,7 +19,7 @@ dict(type='Linear', in_features=1024, out_features=4) ] -num_classes = 1203 +num_classes = 22047 model = dict( type='CascadeRCNN', @@ -31,18 +31,23 @@ pad_size_divisor=32, batch_augments=batch_augments), backbone=dict( - type='ResNet', - depth=50, - num_stages=4, + type='SwinTransformer', + embed_dims=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, + patch_norm=True, out_indices=(1, 2, 3), - frozen_stages=1, - norm_cfg=dict(type='BN', requires_grad=True), - norm_eval=True, - style='pytorch', - init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + with_cp=False), neck=dict( type='FPN', - in_channels=[512, 1024, 2048], + in_channels=[256, 512, 1024], out_channels=256, start_level=0, add_extra_convs='on_output', diff --git a/projects/Detic/detic/utils.py b/projects/Detic/detic/utils.py index a7d5f026caa..56d4fd429d7 100644 --- a/projects/Detic/detic/utils.py +++ b/projects/Detic/detic/utils.py @@ -2,9 +2,19 @@ import numpy as np import torch import torch.nn.functional as F +from mmengine.logging import print_log from .text_encoder import CLIPTextEncoder +# download from +# https://github.com/facebookresearch/Detic/tree/main/datasets/metadata +DATASET_EMBEDDINGS = { + 'lvis': 'datasets/metadata/lvis_v1_clip_a+cname.npy', + 'objects365': 'datasets/metadata/o365_clip_a+cnamefix.npy', + 'openimages': 'datasets/metadata/oid_clip_a+cname.npy', + 'coco': 'datasets/metadata/coco_clip_a+cname.npy', +} + def get_text_embeddings(dataset=None, custom_vocabulary=None, @@ -12,25 +22,18 @@ def get_text_embeddings(dataset=None, assert (dataset is None) ^ (custom_vocabulary is None), \ 'Either `dataset` or `custom_vocabulary` should be specified.' 
if dataset: - assert dataset in DATASET_EMBEDDINGS - return DATASET_EMBEDDINGS[dataset] - elif custom_vocabulary: - text_encoder = CLIPTextEncoder() - text_encoder.eval() - texts = [prompt_prefix + x for x in custom_vocabulary] - embeddings = text_encoder(texts).detach().permute( - 1, 0).contiguous().cpu() - return embeddings - else: - raise NotImplementedError + if dataset in DATASET_EMBEDDINGS: + return DATASET_EMBEDDINGS[dataset] + else: + custom_vocabulary = get_class_names(dataset) - -DATASET_EMBEDDINGS = { - 'lvis': 'datasets/metadata/lvis_v1_clip_a+cname.npy', - 'objects365': 'datasets/metadata/o365_clip_a+cnamefix.npy', - 'openimages': 'datasets/metadata/oid_clip_a+cname.npy', - 'coco': 'datasets/metadata/coco_clip_a+cname.npy', -} + text_encoder = CLIPTextEncoder() + text_encoder.eval() + texts = [prompt_prefix + x for x in custom_vocabulary] + print_log( + f'Computing text embeddings for {len(custom_vocabulary)} classes.') + embeddings = text_encoder(texts).detach().permute(1, 0).contiguous().cpu() + return embeddings def get_class_names(dataset): @@ -56,7 +59,7 @@ def get_class_names(dataset): def reset_cls_layer_weight(model, weight): if type(weight) == str: - print('Resetting cls_layer_weight from file: ', weight) + print_log(f'Resetting cls_layer_weight from file: {weight}') zs_weight = torch.tensor( np.load(weight), dtype=torch.float32).permute(1, 0).contiguous() # D x C From 3c3d58e0520177f0fc20678cd058dcf565381a59 Mon Sep 17 00:00:00 2001 From: RangiLyu Date: Fri, 20 Jan 2023 14:13:09 +0800 Subject: [PATCH 12/12] update demo --- projects/Detic/demo.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/projects/Detic/demo.py b/projects/Detic/demo.py index 01b87d78e51..d5c80c9aa5f 100644 --- a/projects/Detic/demo.py +++ b/projects/Detic/demo.py @@ -71,10 +71,7 @@ def parse_args(): parser.add_argument( '--dataset', type=str, help='dataset name to load the text embedding') parser.add_argument( - '--class-name', - nargs='+', - type=str, - help='Only Save those classes if set') + '--class-name', nargs='+', type=str, help='custom class names') args = parser.parse_args() return args
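
Taken together with `reset_cls_layer_weight` from patch 08, the final demo makes changing vocabularies a pure weight swap at inference time, with no retraining involved. On toy tensors, the transformation that function applies to the classifier weights looks like the following sketch (dimensions are illustrative):

```python
import torch
import torch.nn.functional as F

zs_weight = torch.randn(512, 4)  # D x C: one CLIP embedding per class name

# append an all-zero column serving as the background / no-object class,
# then L2-normalize each class column, as reset_cls_layer_weight does
zs_weight = torch.cat(
    [zs_weight, zs_weight.new_zeros((zs_weight.shape[0], 1))], dim=1)
zs_weight = F.normalize(zs_weight, p=2, dim=0)

assert zs_weight.shape == (512, 5)  # D x (C + 1)
```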