PaddlePaddle · DrRyanHuang · Feb 21, 2023 · Feb 22, 2023 · Feb 24, 2023 · Feb 24, 2023
diff --git a/configs/diffusiondet/README.md b/configs/diffusiondet/README.md
@@ -0,0 +1,25 @@
+# Sparse R-CNN: End-to-End Object Detection with Learnable Proposals
+
+
+## Introduction
+Sparse R-CNN is a purely sparse method for object detection in images.
+
+
+## Model Zoo
+
+| Backbone        | Proposals | lr schedule | Box AP | download   | config |
+| :-------------- | :-----: | :------------: | :-----: | :-----: | :-----: |
+| ResNet50-FPN | 100 | 3x |  43.0  | [download](https://paddledet.bj.bcebos.com/models/sparse_rcnn_r50_fpn_3x_pro100_coco.pdparams) | [config](./sparse_rcnn_r50_fpn_3x_pro100_coco.yml) |
+| ResNet50-FPN | 300 | 3x |  44.6  | [download](https://paddledet.bj.bcebos.com/models/sparse_rcnn_r50_fpn_3x_pro300_coco.pdparams) | [config](./sparse_rcnn_r50_fpn_3x_pro300_coco.yml) |
+
+## Citations
+```
+@misc{sun2021sparse,
+      title={Sparse R-CNN: End-to-End Object Detection with Learnable Proposals},
+      author={Peize Sun and Rufeng Zhang and Yi Jiang and Tao Kong and Chenfeng Xu and Wei Zhan and Masayoshi Tomizuka and Lei Li and Zehuan Yuan and Changhu Wang and Ping Luo},
+      year={2021},
+      eprint={2011.12450},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV}
+}
+```
diff --git a/configs/diffusiondet/_base_/diffusiondet_r50_fpn.yml b/configs/diffusiondet/_base_/diffusiondet_r50_fpn.yml
@@ -0,0 +1,49 @@
+architecture: DiffusionDet
+# pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_cos_pretrained.pdparams
+
+DiffusionDet:
+  backbone: ResNet
+  neck: FPN
+  head: DiffusionDetHead
+  postprocess: DiffusionDetPostProcess
+
+ResNet:
+  # stage indices below are 0-based: index 0 stands for res2
+  depth: 50
+  norm_type: bn
+  freeze_at: -1
+  freeze_norm: true
+  return_idx: [0,1,2,3]
+  num_stages: 4
+
+FPN:
+  out_channel: 256
+  extra_stage: 0
+
+DiffusionDetHead:
+  head_hidden_dim: 256
+  head_dim_feedforward: 2048
+  nhead: 8
+  head_dropout: 0.0
+  head_cls: 1
+  head_reg: 3
+  head_dim_dynamic: 64
+  head_num_dynamic: 2
+  head_num_heads: 6
+  deep_supervision: true
+  num_proposals: 100
+  timesteps: 1000
+  sampling_timesteps: 1
+  loss_func: DiffusionDetSparseRCNNLoss
+
+DiffusionDetSparseRCNNLoss:
+  losses: ["labels", "boxes"]
+  focal_loss_alpha: 0.25
+  focal_loss_gamma: 2.0
+  class_weight: 2.0
+  l1_weight: 5.0
+  giou_weight: 2.0
+  eos_coef: 0.1
+
+DiffusionDetPostProcess:
+  num_proposals: 100
diff --git a/configs/diffusiondet/_base_/diffusiondet_reader.yml b/configs/diffusiondet/_base_/diffusiondet_reader.yml
@@ -0,0 +1,46 @@
+worker_num: 2
+
+TrainReader:
+  sample_transforms:
+  - Decode: {}
+  - RandomFlip: {prob: 0.5}
+  - RandomSelect: { transforms1: [ RandomShortSideResize: { short_side_sizes: [ 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800 ], max_size: 1333 } ],
+                    transforms2: [
+                        RandomShortSideResize: { short_side_sizes: [ 400, 500, 600 ] },
+                        RandomSizeCrop: { min_size: 384, max_size: 600 },
+                        RandomShortSideResize: { short_side_sizes: [ 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800 ], max_size: 1333 } ]
+  }
+  - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
+  - Permute: {}
+  batch_transforms:
+  - PadBatch: {pad_to_stride: 32}
+  - Gt2SparseTarget: {use_padding_shape: False}
+  batch_size: 2
+  shuffle: true
+  drop_last: true
+  collate_batch: false
+
+EvalReader:
+  sample_transforms:
+  - Decode: {}
+  - Resize: {interp: 1, target_size: [800, 1333], keep_ratio: True}
+  - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
+  - Permute: {}
+  batch_transforms:
+  - PadBatch: {pad_to_stride: 32}
+  - Gt2SparseTarget: {use_padding_shape: False}
+  batch_size: 1
+  shuffle: false
+  drop_last: false
+
+TestReader:
+  sample_transforms:
+  - Decode: {}
+  - Resize: {interp: 1, target_size: [800, 1333], keep_ratio: True}
+  - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
+  - Permute: {}
+  batch_transforms:
+  - PadBatch: {pad_to_stride: 32}
+  - Gt2SparseTarget: {use_padding_shape: False}
+  batch_size: 1
+  shuffle: false
diff --git a/configs/diffusiondet/_base_/optimizer_3x.yml b/configs/diffusiondet/_base_/optimizer_3x.yml
@@ -0,0 +1,17 @@
+epoch: 36
+
+LearningRate:
+  base_lr: 0.000025
+  schedulers:
+  - !PiecewiseDecay
+    gamma: 0.1
+    milestones: [28, 34]
+  - !LinearWarmup
+    start_factor: 0.01
+    steps: 1000
+
+OptimizerBuilder:
+  clip_grad_by_norm: 1.0
+  optimizer:
+    type: AdamW
+    weight_decay: 0.0001
diff --git a/configs/diffusiondet/diffusiondet_r50_fpn_3x_pro100_coco.yml b/configs/diffusiondet/diffusiondet_r50_fpn_3x_pro100_coco.yml
@@ -0,0 +1,10 @@
+_BASE_: [
+  '../datasets/coco_detection.yml',
+  '../runtime.yml',
+  '_base_/diffusiondet_r50_fpn.yml',
+  '_base_/optimizer_3x.yml',
+  '_base_/diffusiondet_reader.yml',
+]
+
+num_classes: 80
+# weights: output/diffusiondet_r50_fpn_3x_pro100_coco/model_final
diff --git a/configs/diffusiondet/diffusiondet_r50_fpn_3x_pro500_coco.yml b/configs/diffusiondet/diffusiondet_r50_fpn_3x_pro500_coco.yml
@@ -0,0 +1,21 @@
+_BASE_: [
+  '../datasets/coco_detection.yml',
+  '../runtime.yml',
+  '_base_/diffusiondet_r50_fpn.yml',
+  '_base_/optimizer_3x.yml',
+  '_base_/diffusiondet_reader.yml',
+]
+
+use_focal: True
+use_fed_loss: False
+num_classes: 80
+weights: output/model_final
+
+snapshot_epoch: 1
+
+
+DiffusionDetHead:
+  num_proposals: 500
+
+DiffusionDetPostProcess:
+  num_proposals: 500
diff --git a/ppdet/modeling/architectures/__init__.py b/ppdet/modeling/architectures/__init__.py
@@ -67,6 +67,7 @@
 from .picodet import *
 from .detr import *
 from .sparse_rcnn import *
+from .diffusion_det import *
 from .tood import *
 from .retinanet import *
 from .bytetrack import *

diff --git a/ppdet/modeling/architectures/diffusion_det.py b/ppdet/modeling/architectures/diffusion_det.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from ppdet.core.workspace import register, create
+from .meta_arch import BaseArch
+
+__all__ = ["DiffusionDet"]
+
+
+@register
+class DiffusionDet(BaseArch):
+    """DiffusionDet architecture: backbone -> neck -> head -> postprocess.
+
+    `postprocess` is listed in `__inject__`; the rest come from `from_config`.
+    """
+    __category__ = 'architecture'
+    __inject__ = ["postprocess"]
+
+    def __init__(self,
+                 backbone,
+                 neck,
+                 head="DiffusionDetHead",
+                 postprocess="SparsePostProcess"):
+        # NOTE(review): the default post-processor is "SparsePostProcess" --
+        # confirm that is intended for this architecture.
+        super(DiffusionDet, self).__init__()
+        self.backbone, self.neck = backbone, neck
+        self.head, self.postprocess = head, postprocess
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        """Build backbone, neck and head from `cfg`.
+
+        The neck is created with the backbone's `out_shape` and the head
+        with the neck's `out_shape`; incoming args/kwargs are not used.
+        """
+        backbone = create(cfg['backbone'])
+        neck_shape = backbone.out_shape
+        neck = create(cfg['neck'], input_shape=neck_shape)
+        head_shape = neck.out_shape
+        head = create(cfg['head'], roi_input_shape=head_shape)
+        return dict(backbone=backbone, neck=neck, head=head)
+
+    def _forward(self):
+        """Return (head_outs, targets) in train, postprocess output in eval."""
+        feats = self.neck(self.backbone(self.inputs))
+        head_outs, targets = self.head(feats, self.inputs)
+        if self.training:
+            return head_outs, targets
+
+        # Only index 0 of inputs['im_shape'] is read (presumably first sample).
+        h, w = self.inputs['im_shape'][0].numpy().astype(int)
+        bbox_pred, bbox_num = self.postprocess(head_outs[0], h, w,
+                                               self.inputs['ori_shape'])
+        return bbox_pred, bbox_num
+
+    def get_loss(self):
+        """Return the head's loss dict plus the sum of its values as "loss"."""
+        head_outs, targets = self._forward()
+        losses = self.head.get_loss(head_outs, targets)
+        losses["loss"] = sum(losses.values())
+        return losses
+
+    def get_pred(self):
+        """Return predictions as a dict with 'bbox' and 'bbox_num'."""
+        bbox_pred, bbox_num = self._forward()
+        return {'bbox': bbox_pred, 'bbox_num': bbox_num}
diff --git a/ppdet/modeling/heads/__init__.py b/ppdet/modeling/heads/__init__.py
@@ -39,9 +39,11 @@
 from . import ppyoloe_contrast_head
 from . import centertrack_head
 from . import sparse_roi_head
+from . import diffusion_det_head
 from . import vitpose_head
 from . import clrnet_head
 
+
 from .bbox_head import *
 from .mask_head import *
 from .yolo_head import *
@@ -70,5 +72,6 @@
 from .centertrack_head import *
 from .sparse_roi_head import *
 from .petr_head import *
+from .diffusion_det_head import *
 from .vitpose_head import *
-from .clrnet_head import *
+from .clrnet_head import *