From 88b86943b550957ffbc838dc63d455c5ead2a1d3 Mon Sep 17 00:00:00 2001 From: Sun Jiahao <72679458+sunjiahao1999@users.noreply.github.com> Date: Thu, 4 Jan 2024 21:19:29 +0800 Subject: [PATCH] [Feature] Support PGD and multi-view FCOS3D++ on Waymo (#2835) Co-authored-by: JingweiZhang12 Co-authored-by: sjh --- .../datasets/waymoD3-fov-mono3d-3class.py | 184 ++++++++++++++++ .../datasets/waymoD3-mv-mono3d-3class.py | 191 +++++++++++++++++ .../_base_/datasets/waymoD5-mv3d-3class.py | 32 ++- configs/_base_/models/multiview_dfm.py | 28 +-- configs/mvfcos3d/README.md | 62 ++++++ ...fcos3d_r101-dcn_8xb2_waymoD5-3d-3class.py} | 0 ...dcn_centerhead_16xb2_waymoD5-3d-3class.py} | 0 configs/pgd/README.md | 17 ++ ...1_fpn-head_dcn_16xb3_waymoD5-fov-mono3d.py | 6 +- ..._gn-head_dcn_8xb3-2x_waymoD3-fov-mono3d.py | 112 ++++++++++ ...n_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d.py | 111 ++++++++++ docs/en/user_guides/dataset_prepare.md | 44 ++-- docs/zh_cn/user_guides/dataset_prepare.md | 44 ++-- mmdet3d/models/dense_heads/pgd_head.py | 2 - mmdet3d/models/detectors/dfm.py | 7 +- mmdet3d/models/detectors/multiview_dfm.py | 196 ++++++++++++++---- 16 files changed, 909 insertions(+), 127 deletions(-) create mode 100644 configs/_base_/datasets/waymoD3-fov-mono3d-3class.py create mode 100644 configs/_base_/datasets/waymoD3-mv-mono3d-3class.py create mode 100644 configs/mvfcos3d/README.md rename configs/{dfm/multiview-dfm_r101-dcn_16xb2_waymoD5-3d-3class.py => mvfcos3d/multiview-fcos3d_r101-dcn_8xb2_waymoD5-3d-3class.py} (100%) rename configs/{dfm/multiview-dfm_r101-dcn_centerhead_16xb2_waymoD5-3d-3class.py => mvfcos3d/multiview-fcos3d_r101-dcn_centerhead_16xb2_waymoD5-3d-3class.py} (100%) create mode 100644 configs/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-fov-mono3d.py create mode 100644 configs/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d.py diff --git a/configs/_base_/datasets/waymoD3-fov-mono3d-3class.py b/configs/_base_/datasets/waymoD3-fov-mono3d-3class.py new file mode 100644 index 0000000000..b0142b9c94 --- /dev/null +++ b/configs/_base_/datasets/waymoD3-fov-mono3d-3class.py @@ -0,0 +1,184 @@ +# dataset settings +# D3 in the config name means the whole dataset is divided into 3 folds +# We only use one fold for efficient experiments +dataset_type = 'WaymoDataset' +data_root = 'data/waymo/kitti_format/' +class_names = ['Pedestrian', 'Cyclist', 'Car'] +metainfo = dict(classes=class_names) +input_modality = dict(use_lidar=False, use_camera=True) + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection3d/waymo/kitti_format/' + +# Method 2: Use backend_args, file_client_args in versions before 1.1.0 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection3d/', +# 'data/': 's3://openmmlab/datasets/detection3d/' +# })) +backend_args = None + +train_pipeline = [ + dict(type='LoadImageFromFileMono3D', backend_args=backend_args), + dict( + type='LoadAnnotations3D', + with_bbox=True, + with_label=True, + with_attr_label=False, + with_bbox_3d=True, + with_label_3d=True, + with_bbox_depth=True), + # base shape (1248, 832), scale (0.95, 1.05) + dict( + type='RandomResize3D', + scale=(1248, 832), + ratio_range=(0.95, 1.05), + # ratio_range=(1., 1.), + interpolation='nearest', + keep_ratio=True, + ), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict( + 
type='Pack3DDetInputs', + keys=[ + 'img', 'gt_bboxes', 'gt_bboxes_labels', 'gt_bboxes_3d', + 'gt_labels_3d', 'centers_2d', 'depths' + ]), +] + +test_pipeline = [ + dict(type='LoadImageFromFileMono3D', backend_args=backend_args), + dict( + type='RandomResize3D', + scale=(1248, 832), + ratio_range=(1., 1.), + interpolation='nearest', + keep_ratio=True), + dict( + type='Pack3DDetInputs', + keys=['img'], + meta_keys=[ + 'box_type_3d', 'img_shape', 'cam2img', 'scale_factor', + 'sample_idx', 'context_name', 'timestamp', 'lidar2cam' + ]), +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict(type='LoadImageFromFileMono3D', backend_args=backend_args), + dict( + type='RandomResize3D', + scale=(1248, 832), + ratio_range=(1., 1.), + interpolation='nearest', + keep_ratio=True), + dict( + type='Pack3DDetInputs', + keys=['img'], + meta_keys=[ + 'box_type_3d', 'img_shape', 'cam2img', 'scale_factor', + 'sample_idx', 'context_name', 'timestamp', 'lidar2cam' + ]), +] + +train_dataloader = dict( + batch_size=3, + num_workers=3, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='waymo_infos_train.pkl', + data_prefix=dict( + pts='training/velodyne', + CAM_FRONT='training/image_0', + CAM_FRONT_LEFT='training/image_1', + CAM_FRONT_RIGHT='training/image_2', + CAM_SIDE_LEFT='training/image_3', + CAM_SIDE_RIGHT='training/image_4'), + pipeline=train_pipeline, + modality=input_modality, + test_mode=False, + metainfo=metainfo, + cam_sync_instances=True, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='Camera', + load_type='fov_image_based', + # load one frame every three frames + load_interval=3, + backend_args=backend_args)) + +val_dataloader = dict( + batch_size=1, + num_workers=1, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + pts='training/velodyne', + CAM_FRONT='training/image_0', + CAM_FRONT_LEFT='training/image_1', + CAM_FRONT_RIGHT='training/image_2', + CAM_SIDE_LEFT='training/image_3', + CAM_SIDE_RIGHT='training/image_4'), + ann_file='waymo_infos_val.pkl', + pipeline=eval_pipeline, + modality=input_modality, + test_mode=True, + metainfo=metainfo, + cam_sync_instances=True, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='Camera', + load_type='fov_image_based', + load_eval_anns=False, + backend_args=backend_args)) + +test_dataloader = dict( + batch_size=1, + num_workers=1, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + pts='training/velodyne', + CAM_FRONT='training/image_0', + CAM_FRONT_LEFT='training/image_1', + CAM_FRONT_RIGHT='training/image_2', + CAM_SIDE_LEFT='training/image_3', + CAM_SIDE_RIGHT='training/image_4'), + ann_file='waymo_infos_val.pkl', + pipeline=eval_pipeline, + modality=input_modality, + test_mode=True, + metainfo=metainfo, + cam_sync_instances=True, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
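+        # 'fov_image_based' keeps only the front (field-of-view) camera image
+        # per frame and the instances visible in it, matching the fov_gt.bin
+        # used by the evaluator below.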
+ box_type_3d='Camera', + load_type='fov_image_based', + backend_args=backend_args)) + +val_evaluator = dict( + type='WaymoMetric', + waymo_bin_file='./data/waymo/waymo_format/fov_gt.bin', + metric='LET_mAP', + load_type='fov_image_based', + result_prefix='./pgd_fov_pred') +test_evaluator = val_evaluator + +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer') diff --git a/configs/_base_/datasets/waymoD3-mv-mono3d-3class.py b/configs/_base_/datasets/waymoD3-mv-mono3d-3class.py new file mode 100644 index 0000000000..af245effa7 --- /dev/null +++ b/configs/_base_/datasets/waymoD3-mv-mono3d-3class.py @@ -0,0 +1,191 @@ +# dataset settings +# D3 in the config name means the whole dataset is divided into 3 folds +# We only use one fold for efficient experiments +dataset_type = 'WaymoDataset' +data_root = 'data/waymo/kitti_format/' +class_names = ['Pedestrian', 'Cyclist', 'Car'] +metainfo = dict(classes=class_names) +input_modality = dict(use_lidar=False, use_camera=True) + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection3d/waymo/kitti_format/' + +# Method 2: Use backend_args, file_client_args in versions before 1.1.0 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection3d/', +# 'data/': 's3://openmmlab/datasets/detection3d/' +# })) +backend_args = None + +train_pipeline = [ + dict(type='LoadImageFromFileMono3D', backend_args=backend_args), + dict( + type='LoadAnnotations3D', + with_bbox=True, + with_label=True, + with_attr_label=False, + with_bbox_3d=True, + with_label_3d=True, + with_bbox_depth=True), + # base shape (1248, 832), scale (0.95, 1.05) + dict( + type='RandomResize3D', + scale=(1248, 832), + # ratio_range=(1., 1.), + ratio_range=(0.95, 1.05), + interpolation='nearest', + keep_ratio=True, + ), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict( + type='Pack3DDetInputs', + keys=[ + 'img', 'gt_bboxes', 'gt_bboxes_labels', 'gt_bboxes_3d', + 'gt_labels_3d', 'centers_2d', 'depths' + ]), +] + +test_pipeline = [ + dict(type='LoadImageFromFileMono3D', backend_args=backend_args), + dict( + type='Resize3D', + scale_factor=0.65, + interpolation='nearest', + keep_ratio=True), + dict( + type='Pack3DDetInputs', + keys=['img'], + meta_keys=[ + 'box_type_3d', 'img_shape', 'cam2img', 'scale_factor', + 'sample_idx', 'context_name', 'timestamp', 'lidar2cam' + ]), +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. 
client) +eval_pipeline = [ + dict(type='LoadImageFromFileMono3D', backend_args=backend_args), + dict( + type='Resize3D', + scale_factor=0.65, + interpolation='nearest', + keep_ratio=True), + dict( + type='Pack3DDetInputs', + keys=['img'], + meta_keys=[ + 'box_type_3d', 'img_shape', 'cam2img', 'scale_factor', + 'sample_idx', 'context_name', 'timestamp', 'lidar2cam' + ]), +] + +train_dataloader = dict( + batch_size=3, + num_workers=3, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='waymo_infos_train.pkl', + data_prefix=dict( + pts='training/velodyne', + CAM_FRONT='training/image_0', + CAM_FRONT_LEFT='training/image_1', + CAM_FRONT_RIGHT='training/image_2', + CAM_SIDE_LEFT='training/image_3', + CAM_SIDE_RIGHT='training/image_4'), + pipeline=train_pipeline, + modality=input_modality, + test_mode=False, + metainfo=metainfo, + cam_sync_instances=True, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='Camera', + load_type='mv_image_based', + # load one frame every three frames + load_interval=3, + backend_args=backend_args)) + +val_dataloader = dict( + batch_size=1, + num_workers=0, + persistent_workers=False, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + pts='training/velodyne', + CAM_FRONT='training/image_0', + CAM_FRONT_LEFT='training/image_1', + CAM_FRONT_RIGHT='training/image_2', + CAM_SIDE_LEFT='training/image_3', + CAM_SIDE_RIGHT='training/image_4'), + ann_file='waymo_infos_val.pkl', + pipeline=eval_pipeline, + modality=input_modality, + test_mode=True, + metainfo=metainfo, + cam_sync_instances=True, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='Camera', + load_type='mv_image_based', + # load_eval_anns=False, + backend_args=backend_args)) + +test_dataloader = dict( + batch_size=1, + num_workers=0, + persistent_workers=False, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + pts='training/velodyne', + CAM_FRONT='training/image_0', + CAM_FRONT_LEFT='training/image_1', + CAM_FRONT_RIGHT='training/image_2', + CAM_SIDE_LEFT='training/image_3', + CAM_SIDE_RIGHT='training/image_4'), + ann_file='waymo_infos_val.pkl', + pipeline=eval_pipeline, + modality=input_modality, + test_mode=True, + metainfo=metainfo, + cam_sync_instances=True, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
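+        # 'mv_image_based' treats every camera image of a frame as a separate
+        # monocular sample, matching the multi-view cam_gt.bin used by the
+        # evaluator below.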
+ box_type_3d='Camera', + load_type='mv_image_based', + load_eval_anns=False, + backend_args=backend_args)) + +val_evaluator = dict( + type='WaymoMetric', + waymo_bin_file='./data/waymo/waymo_format/cam_gt.bin', + metric='LET_mAP', + load_type='mv_image_based', + result_prefix='./pgd_mv_pred', + nms_cfg=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_pre=500, + nms_thr=0.05, + score_thr=0.001, + min_bbox_size=0, + max_per_frame=100)) +test_evaluator = val_evaluator + +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer') diff --git a/configs/_base_/datasets/waymoD5-mv3d-3class.py b/configs/_base_/datasets/waymoD5-mv3d-3class.py index a9cd619da4..6ea2678993 100644 --- a/configs/_base_/datasets/waymoD5-mv3d-3class.py +++ b/configs/_base_/datasets/waymoD5-mv3d-3class.py @@ -1,5 +1,5 @@ # dataset settings -# D3 in the config name means the whole dataset is divided into 3 folds +# D5 in the config name means the whole dataset is divided into 5 folds # We only use one fold for efficient experiments dataset_type = 'WaymoDataset' data_root = 'data/waymo/kitti_format/' @@ -19,7 +19,7 @@ # })) backend_args = None -class_names = ['Car', 'Pedestrian', 'Cyclist'] +class_names = ['Pedestrian', 'Cyclist', 'Car'] input_modality = dict(use_lidar=False, use_camera=True) point_cloud_range = [-35.0, -75.0, -2, 75.0, 75.0, 4] @@ -30,7 +30,7 @@ scale=(1248, 832), ratio_range=(0.95, 1.05), keep_ratio=True), - dict(type='RandomCrop3D', crop_size=(720, 1080)), + dict(type='RandomCrop3D', crop_size=(1080, 720)), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5, flip_box3d=False), ] @@ -70,7 +70,14 @@ to_float32=True, backend_args=backend_args), dict(type='MultiViewWrapper', transforms=test_transforms), - dict(type='Pack3DDetInputs', keys=['img']) + dict( + type='Pack3DDetInputs', + keys=['img'], + meta_keys=[ + 'box_type_3d', 'img_shape', 'ori_cam2img', 'scale_factor', + 'sample_idx', 'context_name', 'timestamp', 'lidar2cam', + 'num_ref_frames', 'num_views' + ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. 
client) @@ -80,7 +87,14 @@ to_float32=True, backend_args=backend_args), dict(type='MultiViewWrapper', transforms=test_transforms), - dict(type='Pack3DDetInputs', keys=['img']) + dict( + type='Pack3DDetInputs', + keys=['img'], + meta_keys=[ + 'box_type_3d', 'img_shape', 'ori_cam2img', 'scale_factor', + 'sample_idx', 'context_name', 'timestamp', 'lidar2cam', + 'num_ref_frames', 'num_views' + ]) ] metainfo = dict(classes=class_names) @@ -103,6 +117,7 @@ pipeline=train_pipeline, modality=input_modality, test_mode=False, + cam_sync_instances=True, metainfo=metainfo, box_type_3d='Lidar', load_interval=5, @@ -149,7 +164,7 @@ CAM_FRONT_RIGHT='training/image_2', CAM_SIDE_LEFT='training/image_3', CAM_SIDE_RIGHT='training/image_4'), - pipeline=eval_pipeline, + pipeline=test_pipeline, modality=input_modality, test_mode=True, metainfo=metainfo, @@ -157,10 +172,7 @@ backend_args=backend_args)) val_evaluator = dict( type='WaymoMetric', - ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl', waymo_bin_file='./data/waymo/waymo_format/cam_gt.bin', - data_root='./data/waymo/waymo_format', - metric='LET_mAP', - backend_args=backend_args) + metric='LET_mAP') test_evaluator = val_evaluator diff --git a/configs/_base_/models/multiview_dfm.py b/configs/_base_/models/multiview_dfm.py index e6f4d276c7..7fa5376d1e 100644 --- a/configs/_base_/models/multiview_dfm.py +++ b/configs/_base_/models/multiview_dfm.py @@ -35,7 +35,7 @@ type='AlignedAnchor3DRangeGenerator', ranges=[[-35.0, -75.0, -2, 75.0, 75.0, 4]], rotations=[.0]), - bbox_head=dict( + bbox_head_3d=dict( type='Anchor3DHead', num_classes=3, in_channels=256, @@ -43,13 +43,13 @@ use_direction_classifier=True, anchor_generator=dict( type='AlignedAnchor3DRangeGenerator', - ranges=[[-35.0, -75.0, -0.0345, 75.0, 75.0, -0.0345], - [-35.0, -75.0, 0, 75.0, 75.0, 0], - [-35.0, -75.0, -0.1188, 75.0, 75.0, -0.1188]], + ranges=[[-35.0, -75.0, 0, 75.0, 75.0, 0], + [-35.0, -75.0, -0.1188, 75.0, 75.0, -0.1188], + [-35.0, -75.0, -0.0345, 75.0, 75.0, -0.0345]], sizes=[ - [4.73, 2.08, 1.77], # car [0.91, 0.84, 1.74], # pedestrian [1.81, 0.84, 1.77], # cyclist + [4.73, 2.08, 1.77], # car ], rotations=[0, 1.57], reshape_out=False), @@ -69,13 +69,6 @@ loss_weight=0.2)), train_cfg=dict( assigner=[ - dict( # for Car - type='Max3DIoUAssigner', - iou_calculator=dict(type='BboxOverlapsNearest3D'), - pos_iou_thr=0.6, - neg_iou_thr=0.45, - min_pos_iou=0.45, - ignore_iof_thr=-1), dict( # for Pedestrian type='Max3DIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), @@ -90,6 +83,13 @@ neg_iou_thr=0.35, min_pos_iou=0.35, ignore_iof_thr=-1), + dict( # for Car + type='Max3DIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.6, + neg_iou_thr=0.45, + min_pos_iou=0.45, + ignore_iof_thr=-1) ], allowed_border=0, pos_weight=-1, @@ -100,5 +100,5 @@ nms_thr=0.05, score_thr=0.001, min_bbox_size=0, - nms_pre=500, - max_num=100)) + nms_pre=4096, + max_num=500)) diff --git a/configs/mvfcos3d/README.md b/configs/mvfcos3d/README.md new file mode 100644 index 0000000000..934ec1ad6c --- /dev/null +++ b/configs/mvfcos3d/README.md @@ -0,0 +1,62 @@ +# MV-FCOS3D++: Multi-View Camera-Only 4D Object Detection with Pretrained Monocular Backbones + +> [MV-FCOS3D++: Multi-View} Camera-Only 4D Object Detection with Pretrained Monocular Backbones](https://arxiv.org/abs/2207.12716) + + + +## Abstract + +In this technical report, we present our solution, dubbed MV-FCOS3D++, for the Camera-Only 3D Detection track in Waymo Open Dataset Challenge 2022. 
For multi-view camera-only 3D detection, methods based on bird-eye-view or 3D geometric representations can leverage the stereo cues from overlapped regions between adjacent views and directly perform 3D detection without hand-crafted post-processing. However, it lacks direct semantic supervision for 2D backbones, which can be complemented by pretraining simple monocular-based detectors. Our solution is a multi-view framework for 4D detection following this paradigm. It is built upon a simple monocular detector FCOS3D++, pretrained only with object annotations of Waymo, and converts multi-view features to a 3D grid space to detect 3D objects thereon. A dual-path neck for single-frame understanding and temporal stereo matching is devised to incorporate multi-frame information. Our method finally achieves 49.75% mAPL with a single model and wins 2nd place in the WOD challenge, without any LiDAR-based depth supervision during training. The code will be released at [this https URL](https://github.com/Tai-Wang/Depth-from-Motion). + +
+ +
+
+## Introduction
+
+We implement multi-view FCOS3D++ and provide the results on the Waymo dataset.
+
+## Usage
+
+### Training commands
+
+1. You should train PGD first:
+
+```bash
+bash tools/dist_train.sh configs/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d.py 8
+```
+
+2. Given the pre-trained PGD backbone, you can then train multi-view FCOS3D++:
+
+```bash
+bash tools/dist_train.sh configs/mvfcos3d/multiview-fcos3d_r101-dcn_8xb2_waymoD5-3d-3class.py 8 --cfg-options load_from=${PRETRAINED_CHECKPOINT}
+```
+
+**Note**:
+the path of `load_from` needs to be changed to your own pre-trained PGD checkpoint.
+
+## Results and models
+
+### Waymo
+
+| Backbone | Load Interval | mAPL | mAP | mAPH | Download |
+| :--------------------------------------------------------------------: | :-----------: | :--: | :--: | :--: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [ResNet101+DCN](./multiview-fcos3d_r101-dcn_8xb2_waymoD5-3d-3class.py) | 5x | 38.2 | 52.9 | 49.5 | [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/mvfcos3d/multiview-fcos3d_r101-dcn_8xb2_waymoD5-3d-3class/multiview-fcos3d_r101-dcn_8xb2_waymoD5-3d-3class_20231127_122815.log) |
+| above @ Car | | 56.5 | 73.3 | 72.3 | |
+| above @ Pedestrian | | 34.8 | 49.5 | 43.1 | |
+| above @ Cyclist | | 23.2 | 35.9 | 33.3 | |
+
+**Note**:
+
+Regrettably, we are unable to provide the pre-trained model weights due to [Waymo Dataset License Agreement](https://waymo.com/open/terms/), so we only provide the training logs as shown above.
+
+## Citation
+
+```latex
+@article{wang2022mvfcos3d++,
+  title={{MV-FCOS3D++: Multi-View} Camera-Only 4D Object Detection with Pretrained Monocular Backbones},
+  author={Wang, Tai and Lian, Qing and Zhu, Chenming and Zhu, Xinge and Zhang, Wenwei},
+  journal={arXiv preprint},
+  year={2022}
+}
+```
diff --git a/configs/dfm/multiview-dfm_r101-dcn_16xb2_waymoD5-3d-3class.py b/configs/mvfcos3d/multiview-fcos3d_r101-dcn_8xb2_waymoD5-3d-3class.py
similarity index 100%
rename from configs/dfm/multiview-dfm_r101-dcn_16xb2_waymoD5-3d-3class.py
rename to configs/mvfcos3d/multiview-fcos3d_r101-dcn_8xb2_waymoD5-3d-3class.py
diff --git a/configs/dfm/multiview-dfm_r101-dcn_centerhead_16xb2_waymoD5-3d-3class.py b/configs/mvfcos3d/multiview-fcos3d_r101-dcn_centerhead_16xb2_waymoD5-3d-3class.py
similarity index 100%
rename from configs/dfm/multiview-dfm_r101-dcn_centerhead_16xb2_waymoD5-3d-3class.py
rename to configs/mvfcos3d/multiview-fcos3d_r101-dcn_centerhead_16xb2_waymoD5-3d-3class.py
diff --git a/configs/pgd/README.md b/configs/pgd/README.md
index 2237a0f4ef..6c41522b3d 100644
--- a/configs/pgd/README.md
+++ b/configs/pgd/README.md
@@ -50,6 +50,23 @@ Note: mAP represents Car moderate 3D strict AP11 / AP40 results.
Because of the | [above w/ finetune](./pgd_r101-caffe_fpn_head-gn_16xb2-2x_nus-mono3d_finetune.py) | 2x | 9.20 | 35.8 | 42.5 | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d_finetune/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d_finetune_20211114_162135-5ec7c1cd.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d_finetune/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d_finetune_20211114_162135.log.json) | | above w/ tta | 2x | 9.20 | 36.8 | 43.1 | | +### Waymo + +| Backbone | Load Interval | Camera view | mAPL | mAP | mAPH | Download | +| :--------------------------------------------------------------------------: | :-----------: | :-----------: | :--: | :--: | :---: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| [ResNet101 w/ DCN](./pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-fov-mono3d.py) | 3x | front-of-view | 15.8 | 22.7 | 21.51 | [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-fov-mono3d/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-fov-mono3d_20231107_164117.log) | +| above @ Car | | | 36.7 | 51.6 | 51.0 | | +| above @ Pedestrian | | | 9.0 | 14.1 | 11.4 | | +| above @ Cyclist | | | 1.6 | 2.5 | 2.2 | | +| [ResNet101 w/ DCN](./pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d.py) | 3x | multi-view | 20.8 | 29.3 | 27.7 | [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d_20231120_202732.log) | +| above @ Car | | | 41.2 | 56.1 | 55.2 | | +| above @ Pedestrian | | | 20.0 | 29.6 | 25.8 | | +| above @ Cyclist | | | 1.4 | 2.2 | 2.0 | | + +**Note**: + +Regrettably, we are unable to provide the pre-trained model weights due to [Waymo Dataset License Agreement](https://waymo.com/open/terms/), so we only provide the training logs as shown above. 
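+
+Since no pre-trained weights are released, the Waymo numbers above have to be reproduced by training the configs yourself and then evaluating the resulting checkpoint. A minimal sketch using the standard `tools/dist_train.sh` / `tools/dist_test.sh` launchers is shown below; the work-dir and checkpoint paths are placeholders for your own run.
+
+```bash
+# train the front-of-view PGD config on 8 GPUs
+bash tools/dist_train.sh configs/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-fov-mono3d.py 8
+
+# evaluate the trained checkpoint (placeholder path) with the Waymo LET metrics
+bash tools/dist_test.sh configs/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-fov-mono3d.py \
+    work_dirs/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-fov-mono3d/epoch_24.pth 8
+```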
+ ## Citation ```latex diff --git a/configs/pgd/pgd_r101_fpn-head_dcn_16xb3_waymoD5-fov-mono3d.py b/configs/pgd/pgd_r101_fpn-head_dcn_16xb3_waymoD5-fov-mono3d.py index 856fd8f604..fa50e0e04f 100644 --- a/configs/pgd/pgd_r101_fpn-head_dcn_16xb3_waymoD5-fov-mono3d.py +++ b/configs/pgd/pgd_r101_fpn-head_dcn_16xb3_waymoD5-fov-mono3d.py @@ -68,9 +68,9 @@ type='PGDBBoxCoder', base_depths=((41.01, 18.44), ), base_dims=( - (4.73, 1.77, 2.08), - (0.91, 1.74, 0.84), - (1.81, 1.77, 0.84), + (4.73, 1.77, 2.08), # Car + (0.91, 1.74, 0.84), # Pedestrian + (1.81, 1.77, 0.84), # Cyclist ), code_size=7)), # set weight 1.0 for base 7 dims (offset, depth, size, rot) diff --git a/configs/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-fov-mono3d.py b/configs/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-fov-mono3d.py new file mode 100644 index 0000000000..f4c6193749 --- /dev/null +++ b/configs/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-fov-mono3d.py @@ -0,0 +1,112 @@ +_base_ = [ + '../_base_/datasets/waymoD3-fov-mono3d-3class.py', + '../_base_/models/pgd.py', '../_base_/schedules/mmdet-schedule-1x.py', + '../_base_/default_runtime.py' +] +# load_from = '../Depth-from-Motion/checkpoints/pgd_init.pth' +# model settings +model = dict( + backbone=dict( + type='mmdet.ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'), + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, False, True, True)), + neck=dict(num_outs=3), + bbox_head=dict( + num_classes=3, + bbox_code_size=7, + pred_attrs=False, + pred_velo=False, + pred_bbox2d=True, + use_onlyreg_proj=True, + strides=(8, 16, 32), + regress_ranges=((-1, 128), (128, 256), (256, 1e8)), + group_reg_dims=(2, 1, 3, 1, 16, + 4), # offset, depth, size, rot, kpts, bbox2d + reg_branch=( + (256, ), # offset + (256, ), # depth + (256, ), # size + (256, ), # rot + (256, ), # kpts + (256, ) # bbox2d + ), + centerness_branch=(256, ), + loss_cls=dict( + type='mmdet.FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict( + type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + loss_dir=dict( + type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_centerness=dict( + type='mmdet.CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + use_depth_classifier=True, + depth_branch=(256, ), + depth_range=(0, 50), + depth_unit=10, + division='uniform', + depth_bins=6, + pred_keypoints=True, + weight_dim=1, + loss_depth=dict( + type='UncertainSmoothL1Loss', alpha=1.0, beta=3.0, + loss_weight=1.0), + loss_bbox2d=dict( + type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.0), + loss_consistency=dict(type='mmdet.GIoULoss', loss_weight=0.0), + bbox_coder=dict( + type='PGDBBoxCoder', + base_depths=((41.01, 18.44), ), + base_dims=( + (0.91, 1.74, 0.84), # Pedestrian + (1.81, 1.77, 0.84), # Cyclist + (4.73, 1.77, 2.08)), # Car + code_size=7)), + # set weight 1.0 for base 7 dims (offset, depth, size, rot) + # 0.2 for 16-dim keypoint offsets and 1.0 for 4-dim 2D distance targets + train_cfg=dict(code_weight=[ + 1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, + 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 1.0, 1.0, 1.0, 1.0 + ]), + test_cfg=dict(nms_pre=100, nms_thr=0.05, score_thr=0.001, max_per_img=20)) + +# optimizer +optim_wrapper = dict( + optimizer=dict( + type='SGD', + 
lr=0.008, + ), + paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.), + clip_grad=dict(max_norm=35, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0 / 3, + by_epoch=False, + begin=0, + end=500), + dict( + type='MultiStepLR', + begin=0, + end=24, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] + +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=24, val_interval=24) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +auto_scale_lr = dict(enable=False, base_batch_size=48) diff --git a/configs/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d.py b/configs/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d.py new file mode 100644 index 0000000000..2247aa44d8 --- /dev/null +++ b/configs/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d.py @@ -0,0 +1,111 @@ +_base_ = [ + '../_base_/datasets/waymoD3-mv-mono3d-3class.py', + '../_base_/models/pgd.py', '../_base_/schedules/mmdet-schedule-1x.py', + '../_base_/default_runtime.py' +] +# model settings +model = dict( + backbone=dict( + type='mmdet.ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'), + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, False, True, True)), + neck=dict(num_outs=3), + bbox_head=dict( + num_classes=3, + bbox_code_size=7, + pred_attrs=False, + pred_velo=False, + pred_bbox2d=True, + use_onlyreg_proj=True, + strides=(8, 16, 32), + regress_ranges=((-1, 128), (128, 256), (256, 1e8)), + group_reg_dims=(2, 1, 3, 1, 16, + 4), # offset, depth, size, rot, kpts, bbox2d + reg_branch=( + (256, ), # offset + (256, ), # depth + (256, ), # size + (256, ), # rot + (256, ), # kpts + (256, ) # bbox2d + ), + centerness_branch=(256, ), + loss_cls=dict( + type='mmdet.FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict( + type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + loss_dir=dict( + type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_centerness=dict( + type='mmdet.CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + use_depth_classifier=True, + depth_branch=(256, ), + depth_range=(0, 50), + depth_unit=10, + division='uniform', + depth_bins=6, + pred_keypoints=True, + weight_dim=1, + loss_depth=dict( + type='UncertainSmoothL1Loss', alpha=1.0, beta=3.0, + loss_weight=1.0), + loss_bbox2d=dict( + type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.0), + loss_consistency=dict(type='mmdet.GIoULoss', loss_weight=0.0), + bbox_coder=dict( + type='PGDBBoxCoder', + base_depths=((41.01, 18.44), ), + base_dims=( + (0.91, 1.74, 0.84), # Pedestrian + (1.81, 1.77, 0.84), # Cyclist + (4.73, 1.77, 2.08)), # Car + code_size=7)), + # set weight 1.0 for base 7 dims (offset, depth, size, rot) + # 0.2 for 16-dim keypoint offsets and 1.0 for 4-dim 2D distance targets + train_cfg=dict(code_weight=[ + 1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, + 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 1.0, 1.0, 1.0, 1.0 + ]), + test_cfg=dict(nms_pre=100, nms_thr=0.05, score_thr=0.001, max_per_img=20)) + +# optimizer +optim_wrapper = dict( + optimizer=dict( + type='SGD', + lr=0.008, + ), + paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.), + clip_grad=dict(max_norm=35, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0 / 
3, + by_epoch=False, + begin=0, + end=500), + dict( + type='MultiStepLR', + begin=0, + end=24, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] + +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=24, val_interval=24) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +auto_scale_lr = dict(enable=False, base_batch_size=48) diff --git a/docs/en/user_guides/dataset_prepare.md b/docs/en/user_guides/dataset_prepare.md index d1b7cad14e..17f368a9d0 100644 --- a/docs/en/user_guides/dataset_prepare.md +++ b/docs/en/user_guides/dataset_prepare.md @@ -133,10 +133,12 @@ sh tools/create_data.sh kitti ### Waymo -Download Waymo open dataset V1.4.1 [HERE](https://waymo.com/open/download/) and its data split [HERE](https://drive.google.com/drive/folders/18BVuF_RYJF0NjZpt8SnfzANiakoRMf0o?usp=sharing). Then put `.tfrecord` files into corresponding folders in `data/waymo/waymo_format/` and put the data split `.txt` files into `data/waymo/kitti_format/ImageSets`. Download ground truth `.bin` file for validation set [HERE](https://console.cloud.google.com/storage/browser/waymo_open_dataset_v_1_4_1/validation/ground_truth_objects) and put it into `data/waymo/waymo_format/`. A tip is that you can use `gsutil` to download the large-scale dataset with commands. You can take this [tool](https://github.com/RalphMao/Waymo-Dataset-Tool) as an example for more details. Subsequently, prepare waymo data by running: +Download Waymo open dataset V1.4.1 [HERE](https://waymo.com/open/download/) and its data split [HERE](https://drive.google.com/drive/folders/18BVuF_RYJF0NjZpt8SnfzANiakoRMf0o?usp=sharing). Then put `.tfrecord` files into corresponding folders in `data/waymo/waymo_format/` and put the data split `.txt` files into `data/waymo/kitti_format/ImageSets`. Download ground truth `.bin` file for validation set [HERE](https://console.cloud.google.com/storage/browser/waymo_open_dataset_v_1_2_0/validation/ground_truth_objects) and put it into `data/waymo/waymo_format/`. A tip is that you can use `gsutil` to download the large-scale dataset with commands. You can take this [tool](https://github.com/RalphMao/Waymo-Dataset-Tool) as an example for more details. Subsequently, prepare waymo data by running: ```bash -python tools/create_data.py waymo --root-path ./data/waymo/ --out-dir ./data/waymo/ --workers 128 --extra-tag waymo +# TF_CPP_MIN_LOG_LEVEL=3 will disable all logging output from TensorFlow. +# The number of `--workers` depends on the maximum number of cores in your CPU. +TF_CPP_MIN_LOG_LEVEL=3 python tools/create_data.py waymo --root-path ./data/waymo --out-dir ./data/waymo --workers 128 --extra-tag waymo --version v1.4 ``` Note that: @@ -149,28 +151,12 @@ Note that: - **Ready-made Annotations**. We have provided the annotation files generated offline [here](#summary-of-annotation-files). However, the original Waymo data still needs to be converted to `kitti-format` data by yourself. -- **Waymo-mini**. If you just want to use a part of Waymo Dataset to verify some methods or debug quickly, you could use our provided [Waymo-mini](https://download.openmmlab.com/mmdetection3d/data/waymo/waymo_mini_kitti_format.tar.gz) which only contains two segments in train split and one segment in val split from the original dataset. All the images, point clouds and annotations in this compressed file have been processed offline so that you can directly download and unzip it to `data/waymo/`: +- **Waymo-mini**. 
If you just want to use a part of Waymo Dataset to verify some methods or debug quickly, you could use our provided [Waymo-mini](https://download.openmmlab.com/mmdetection3d/data/waymo_mmdet3d_after_1x4/waymo_mini.tar.gz) which only contains two segments in train split and one segment in val split from the original dataset. All the images, point clouds and annotations in this compressed file have been processed offline so that you can directly download and unzip it to `data/waymo/`: ```bash - tar -xzvf waymo_mini_kitti_format.tar.gz -C ./data/waymo + tar -xzvf waymo_mini.tar.gz -C ./data/waymo_mini ``` -- **Faster evaluation**. If you want faster evaluation on Waymo, you can download the preprocessed [metainfo](https://download.openmmlab.com/mmdetection3d/data/waymo/idx2metainfo.pkl) containing `contextname` and `timestamp` to the directory `data/waymo/waymo_format/` and then modify the dataset config as the following: - - ```python - val_evaluator = dict( - type='WaymoMetric', - ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl', - waymo_bin_file='./data/waymo/waymo_format/gt.bin', - data_root='./data/waymo/waymo_format', - backend_args=backend_args, - convert_kitti_format=True, - idx2metainfo='data/waymo/waymo_format/idx2metainfo.pkl' - ) - ``` - - Now, this trick is only used for LiDAR-based detection methods. - ### NuScenes 1. Download nuScenes V1.0 full dataset data [HERE](https://www.nuscenes.org/download). Alternatively, you @@ -272,12 +258,12 @@ python tools/dataset_converters/update_infos_to_v2.py --dataset kitti --pkl-path We provide ready-made annotation files we generated offline for reference. You can directly use these files for convenice. -| Dataset | Train annotation file | Val annotation file | Test information file | -| :--------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------: | -| KITTI | [kitti_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/kitti/kitti_infos_train.pkl) | [kitti_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/kitti/kitti_infos_val.pkl) | [kitti_infos_test](https://download.openmmlab.com/mmdetection3d/data/kitti/kitti_infos_test.pkl) | -| NuScenes | [nuscenes_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_infos_train.pkl) [nuscenes_mini_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_mini_infos_train.pkl) | [nuscenes_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_infos_val.pkl) [nuscenes_mini_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_mini_infos_val.pkl) | | -| Waymo | [waymo_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo/waymo_infos_train.pkl) [waymo_mini_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo/waymo_mini_infos_train.pkl) | 
[waymo_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo/waymo_infos_val.pkl) [waymo_mini_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo/waymo_mini_infos_val.pkl) | [waymo_infos_test.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo/waymo_infos_test.pkl) | -| [Waymo-mini kitti-format data](https://download.openmmlab.com/mmdetection3d/data/waymo/waymo_mini_kitti_format.tar.gz) | | | | -| SUN RGB-D | [sunrgbd_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/sunrgbd/sunrgbd_infos_train.pkl) | [sunrgbd_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/sunrgbd/sunrgbd_infos_val.pkl) | | -| ScanNet | [scannet_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/scannet/scannet_infos_train.pkl) | [scannet_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/scannet/scannet_infos_val.pkl) | [scannet_infos_test.pkl](https://download.openmmlab.com/mmdetection3d/data/scannet/scannet_infos_test.pkl) | -| SemanticKitti | [semantickitti_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/semantickitti/semantickitti_infos_train.pkl) | [semantickitti_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/semantickitti/semantickitti_infos_val.pkl) | [semantickitti_infos_test.pkl](https://download.openmmlab.com/mmdetection3d/data/semantickitti/semantickitti_infos_test.pkl) | +| Dataset | Train annotation file | Val annotation file | Test information file | +| :-------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| KITTI | [kitti_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/kitti/kitti_infos_train.pkl) | [kitti_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/kitti/kitti_infos_val.pkl) | [kitti_infos_test](https://download.openmmlab.com/mmdetection3d/data/kitti/kitti_infos_test.pkl) | +| NuScenes | [nuscenes_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_infos_train.pkl) [nuscenes_mini_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_mini_infos_train.pkl) | [nuscenes_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_infos_val.pkl) [nuscenes_mini_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_mini_infos_val.pkl) | | +| Waymo | [waymo_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo_mmdet3d_after_1x4/waymo_infos_train.pkl) | [waymo_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo_mmdet3d_after_1x4/waymo_infos_val.pkl) | [waymo_infos_test.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo/waymo_infos_test.pkl) 
[waymo_infos_test_cam_only.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo_mmdet3d_after_1x4/waymo_infos_test_cam_only.pkl) | +| [Waymo-mini](https://download.openmmlab.com/mmdetection3d/data/waymo_mmdet3d_after_1x4/waymo_mini.tar.gz) | | | | +| SUN RGB-D | [sunrgbd_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/sunrgbd/sunrgbd_infos_train.pkl) | [sunrgbd_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/sunrgbd/sunrgbd_infos_val.pkl) | | +| ScanNet | [scannet_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/scannet/scannet_infos_train.pkl) | [scannet_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/scannet/scannet_infos_val.pkl) | [scannet_infos_test.pkl](https://download.openmmlab.com/mmdetection3d/data/scannet/scannet_infos_test.pkl) | +| SemanticKitti | [semantickitti_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/semantickitti/semantickitti_infos_train.pkl) | [semantickitti_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/semantickitti/semantickitti_infos_val.pkl) | [semantickitti_infos_test.pkl](https://download.openmmlab.com/mmdetection3d/data/semantickitti/semantickitti_infos_test.pkl) | diff --git a/docs/zh_cn/user_guides/dataset_prepare.md b/docs/zh_cn/user_guides/dataset_prepare.md index 917015dcff..983e988afb 100644 --- a/docs/zh_cn/user_guides/dataset_prepare.md +++ b/docs/zh_cn/user_guides/dataset_prepare.md @@ -111,10 +111,12 @@ python tools/create_data.py kitti --root-path ./data/kitti --out-dir ./data/kitt ### Waymo -在[这里](https://waymo.com/open/download/)下载 Waymo 公开数据集 1.2 版本,在[这里](https://drive.google.com/drive/folders/18BVuF_RYJF0NjZpt8SnfzANiakoRMf0o?usp=sharing)下载其数据划分文件。然后,将 `.tfrecord` 文件置于 `data/waymo/waymo_format/` 目录下的相应位置,并将数据划分的 `.txt` 文件置于 `data/waymo/kitti_format/ImageSets` 目录下。在[这里](https://console.cloud.google.com/storage/browser/waymo_open_dataset_v_1_2_0/validation/ground_truth_objects)下载验证集的真实标签(`.bin` 文件)并将其置于 `data/waymo/waymo_format/`。提示:你可以使用 `gsutil` 来用命令下载大规模的数据集。更多细节请参考此[工具](https://github.com/RalphMao/Waymo-Dataset-Tool)。完成以上各步后,可以通过运行以下指令对 Waymo 数据进行预处理: +在[这里](https://waymo.com/open/download/)下载 Waymo 公开数据集 1.4.1 版本,在[这里](https://drive.google.com/drive/folders/18BVuF_RYJF0NjZpt8SnfzANiakoRMf0o?usp=sharing)下载其数据划分文件。然后,将 `.tfrecord` 文件置于 `data/waymo/waymo_format/` 目录下的相应位置,并将数据划分的 `.txt` 文件置于 `data/waymo/kitti_format/ImageSets` 目录下。在[这里](https://console.cloud.google.com/storage/browser/waymo_open_dataset_v_1_2_0/validation/ground_truth_objects)下载验证集的真实标签(`.bin` 文件)并将其置于 `data/waymo/waymo_format/`。提示:你可以使用 `gsutil` 来用命令下载大规模的数据集。更多细节请参考此[工具](https://github.com/RalphMao/Waymo-Dataset-Tool)。完成以上各步后,可以通过运行以下指令对 Waymo 数据进行预处理: ```bash -python tools/create_data.py waymo --root-path ./data/waymo/ --out-dir ./data/waymo/ --workers 128 --extra-tag waymo +# TF_CPP_MIN_LOG_LEVEL=3 will disable all logging output from TensorFlow. +# The number of `--workers` depends on the maximum number of cores in your CPU. 
+TF_CPP_MIN_LOG_LEVEL=3 python tools/create_data.py waymo --root-path ./data/waymo --out-dir ./data/waymo --workers 128 --extra-tag waymo --version v1.4 ``` 注意: @@ -125,28 +127,12 @@ python tools/create_data.py waymo --root-path ./data/waymo/ --out-dir ./data/way - **现成的标注文件**: 我们已经提供了离线处理好的 [Waymo 标注文件](#数据集标注文件列表)。您直接下载他们并放到 `data/waymo/kitti_format/` 目录下。然而,您还是需要自己使用上面的脚本将 Waymo 的原始数据还需要转成 kitti 格式。 -- **Waymo-mini**: 如果你只是为了验证某些方法或者 debug, 你可以使用我们提供的 [Waymo-mini](https://download.openmmlab.com/mmdetection3d/data/waymo/waymo_mini_kitti_format.tar.gz)。它只包含原始数据集中训练集中的 2 个 segments 和 验证集中的 1 个 segment。您只需要下载并且解压到 `data/waymo/`,即可使用它: +- **Waymo-mini**: 如果你只是为了验证某些方法或者 debug, 你可以使用我们提供的 [Waymo-mini](https://download.openmmlab.com/mmdetection3d/data/waymo_mmdet3d_after_1x4/waymo_mini.tar.gz)。它只包含原始数据集中训练集中的 2 个 segments 和 验证集中的 1 个 segment。您只需要下载并且解压到 `data/waymo_mini/`,即可使用它: ```bash - tar -xzvf waymo_mini_kitti_format.tar.gz -C ./data/waymo + tar -xzvf waymo_mini.tar.gz -C ./data/waymo_mini ``` -- **更快的评估**: 如果你想在 Waymo 上进行更快的评估,你可以下载已经预处理好的[元信息文件](https://download.openmmlab.com/mmdetection3d/data/waymo/idx2metainfo.pkl)并将其放置在 `data/waymo/waymo_format/` 目录下。接着,你可以按照以下来修改数据集的配置: - - ```python - val_evaluator = dict( - type='WaymoMetric', - ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl', - waymo_bin_file='./data/waymo/waymo_format/gt.bin', - data_root='./data/waymo/waymo_format', - backend_args=backend_args, - convert_kitti_format=True, - idx2metainfo='data/waymo/waymo_format/idx2metainfo.pkl' - ) - ``` - - 目前这种方式仅限于纯点云检测任务。 - ### NuScenes 在[这里](https://www.nuscenes.org/download)下载 nuScenes 数据集 1.0 版本的完整数据文件。通过运行以下指令对 nuScenes 数据进行预处理: @@ -222,12 +208,12 @@ python tools/dataset_converters/update_infos_to_v2.py --dataset kitti --pkl-path 我们提供了离线生成好的数据集标注文件以供参考。为了方便,您也可以直接使用他们。 -| 数据集 | 训练集标注文件 | 验证集标注文件 | 测试集标注文件 | -| :--------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------: | -| KITTI | [kitti_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/kitti/kitti_infos_train.pkl) | [kitti_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/kitti/kitti_infos_val.pkl) | [kitti_infos_test](https://download.openmmlab.com/mmdetection3d/data/kitti/kitti_infos_test.pkl) | -| NuScenes | [nuscenes_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_infos_train.pkl) [nuscenes_mini_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_mini_infos_train.pkl) | [nuscenes_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_infos_val.pkl) [nuscenes_mini_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_mini_infos_val.pkl) | | -| Waymo | [waymo_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo/waymo_infos_train.pkl) 
[waymo_mini_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo/waymo_mini_infos_train.pkl) | [waymo_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo/waymo_infos_val.pkl) [waymo_mini_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo/waymo_mini_infos_val.pkl) | [waymo_infos_test.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo/waymo_infos_test.pkl) | -| [Waymo-mini kitti-format data](https://download.openmmlab.com/mmdetection3d/data/waymo/waymo_mini_kitti_format.tar.gz) | | | | -| SUN RGB-D | [sunrgbd_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/sunrgbd/sunrgbd_infos_train.pkl) | [sunrgbd_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/sunrgbd/sunrgbd_infos_val.pkl) | | -| ScanNet | [scannet_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/scannet/scannet_infos_train.pkl) | [scannet_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/scannet/scannet_infos_val.pkl) | [scannet_infos_test.pkl](https://download.openmmlab.com/mmdetection3d/data/scannet/scannet_infos_test.pkl) | -| SemanticKitti | [semantickitti_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/semantickitti/semantickitti_infos_train.pkl) | [semantickitti_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/semantickitti/semantickitti_infos_val.pkl) | [semantickitti_infos_test.pkl](https://download.openmmlab.com/mmdetection3d/data/semantickitti/semantickitti_infos_test.pkl) | +| 数据集 | 训练集标注文件 | 验证集标注文件 | 测试集标注文件 | +| :-------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| KITTI | [kitti_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/kitti/kitti_infos_train.pkl) | [kitti_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/kitti/kitti_infos_val.pkl) | [kitti_infos_test](https://download.openmmlab.com/mmdetection3d/data/kitti/kitti_infos_test.pkl) | +| NuScenes | [nuscenes_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_infos_train.pkl) [nuscenes_mini_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_mini_infos_train.pkl) | [nuscenes_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_infos_val.pkl) [nuscenes_mini_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_mini_infos_val.pkl) | | +| Waymo | [waymo_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo_mmdet3d_after_1x4/waymo_infos_train.pkl) | [waymo_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo_mmdet3d_after_1x4/waymo_infos_val.pkl) | [waymo_infos_test.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo/waymo_infos_test.pkl) 
[waymo_infos_test_cam_only.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo_mmdet3d_after_1x4/waymo_infos_test_cam_only.pkl) | +| [Waymo-mini](https://download.openmmlab.com/mmdetection3d/data/waymo_mmdet3d_after_1x4/waymo_mini.tar.gz) | | | | +| SUN RGB-D | [sunrgbd_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/sunrgbd/sunrgbd_infos_train.pkl) | [sunrgbd_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/sunrgbd/sunrgbd_infos_val.pkl) | | +| ScanNet | [scannet_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/scannet/scannet_infos_train.pkl) | [scannet_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/scannet/scannet_infos_val.pkl) | [scannet_infos_test.pkl](https://download.openmmlab.com/mmdetection3d/data/scannet/scannet_infos_test.pkl) | +| SemanticKitti | [semantickitti_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/semantickitti/semantickitti_infos_train.pkl) | [semantickitti_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/semantickitti/semantickitti_infos_val.pkl) | [semantickitti_infos_test.pkl](https://download.openmmlab.com/mmdetection3d/data/semantickitti/semantickitti_infos_test.pkl) | diff --git a/mmdet3d/models/dense_heads/pgd_head.py b/mmdet3d/models/dense_heads/pgd_head.py index c33ddb9e24..4a733b8691 100644 --- a/mmdet3d/models/dense_heads/pgd_head.py +++ b/mmdet3d/models/dense_heads/pgd_head.py @@ -1032,8 +1032,6 @@ def _predict_by_feat_single(self, # change the offset to actual center predictions bbox_pred3d[:, :2] = points - bbox_pred3d[:, :2] if rescale: - bbox_pred3d[:, :2] /= bbox_pred3d[:, :2].new_tensor( - scale_factor[0]) if self.pred_bbox2d: bbox_pred2d /= bbox_pred2d.new_tensor(scale_factor[0]) if self.use_depth_classifier: diff --git a/mmdet3d/models/detectors/dfm.py b/mmdet3d/models/detectors/dfm.py index 736e43e826..7494c8546f 100644 --- a/mmdet3d/models/detectors/dfm.py +++ b/mmdet3d/models/detectors/dfm.py @@ -13,6 +13,9 @@ class DfM(BaseDetector): `_. Args: + data_preprocessor (:obj:`ConfigDict` or dict): The pre-process + config of :class:`BaseDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. backbone (:obj:`ConfigDict` or dict): The backbone config. neck (:obj:`ConfigDict` or dict): The neck config. backbone_stereo (:obj:`ConfigDict` or dict): The stereo backbone @@ -39,6 +42,7 @@ class DfM(BaseDetector): """ def __init__(self, + data_preprocessor: ConfigType, backbone: ConfigType, neck: ConfigType, backbone_stereo: ConfigType, @@ -53,7 +57,8 @@ def __init__(self, test_cfg=None, pretrained=None, init_cfg=None): - super().__init__(init_cfg=init_cfg) + super().__init__( + data_preprocessor=data_preprocessor, init_cfg=init_cfg) self.backbone = MODELS.build(backbone) self.neck = MODELS.build(neck) if backbone_stereo is not None: diff --git a/mmdet3d/models/detectors/multiview_dfm.py b/mmdet3d/models/detectors/multiview_dfm.py index fce4c92014..81446d30f2 100644 --- a/mmdet3d/models/detectors/multiview_dfm.py +++ b/mmdet3d/models/detectors/multiview_dfm.py @@ -1,20 +1,22 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Union + import numpy as np import torch -from mmdet.models.detectors import BaseDetector +from mmengine.structures import InstanceData +from torch import Tensor from mmdet3d.models.layers.fusion_layers.point_fusion import (point_sample, voxel_sample) from mmdet3d.registry import MODELS, TASK_UTILS from mmdet3d.structures.bbox_3d.utils import get_lidar2img from mmdet3d.structures.det3d_data_sample import SampleList -from mmdet3d.utils import ConfigType, OptConfigType +from mmdet3d.utils import ConfigType, OptConfigType, OptInstanceList from .dfm import DfM -from .imvoxelnet import ImVoxelNet @MODELS.register_module() -class MultiViewDfM(ImVoxelNet, DfM): +class MultiViewDfM(DfM): r"""Waymo challenge solution of `MV-FCOS3D++ `_. @@ -25,7 +27,7 @@ class MultiViewDfM(ImVoxelNet, DfM): config. backbone_3d (:obj:`ConfigDict` or dict): The 3d backbone config. neck_3d (:obj:`ConfigDict` or dict): The 3D neck config. - bbox_head (:obj:`ConfigDict` or dict): The bbox head config. + bbox_head_3d (:obj:`ConfigDict` or dict): The bbox head config. voxel_size (:obj:`ConfigDict` or dict): The voxel size. anchor_generator (:obj:`ConfigDict` or dict): The anchor generator config. @@ -60,7 +62,7 @@ def __init__(self, backbone_stereo: ConfigType, backbone_3d: ConfigType, neck_3d: ConfigType, - bbox_head: ConfigType, + bbox_head_3d: ConfigType, voxel_size: ConfigType, anchor_generator: ConfigType, neck_2d: ConfigType = None, @@ -71,41 +73,24 @@ def __init__(self, test_cfg: OptConfigType = None, data_preprocessor: OptConfigType = None, valid_sample: bool = True, - temporal_aggregate: str = 'concat', + temporal_aggregate: str = 'mean', transform_depth: bool = True, init_cfg: OptConfigType = None): - # TODO merge with DFM - BaseDetector.__init__( - self, data_preprocessor=data_preprocessor, init_cfg=init_cfg) - - self.backbone = MODELS.build(backbone) - self.neck = MODELS.build(neck) - if backbone_stereo is not None: - backbone_stereo.update(cat_img_feature=self.neck.cat_img_feature) - backbone_stereo.update(in_sem_channels=self.neck.sem_channels[-1]) - self.backbone_stereo = MODELS.build(backbone_stereo) - assert self.neck.cat_img_feature == \ - self.backbone_stereo.cat_img_feature - assert self.neck.sem_channels[ - -1] == self.backbone_stereo.in_sem_channels - if backbone_3d is not None: - self.backbone_3d = MODELS.build(backbone_3d) - if neck_3d is not None: - self.neck_3d = MODELS.build(neck_3d) - if neck_2d is not None: - self.neck_2d = MODELS.build(neck_2d) - if bbox_head_2d is not None: - self.bbox_head_2d = MODELS.build(bbox_head_2d) - if depth_head_2d is not None: - self.depth_head_2d = MODELS.build(depth_head_2d) - if depth_head is not None: - self.depth_head = MODELS.build(depth_head) - self.depth_samples = self.depth_head.depth_samples - self.train_cfg = train_cfg - self.test_cfg = test_cfg - bbox_head.update(train_cfg=train_cfg) - bbox_head.update(test_cfg=test_cfg) - self.bbox_head = MODELS.build(bbox_head) + super().__init__( + data_preprocessor=data_preprocessor, + backbone=backbone, + neck=neck, + backbone_stereo=backbone_stereo, + backbone_3d=backbone_3d, + neck_3d=neck_3d, + bbox_head_3d=bbox_head_3d, + neck_2d=neck_2d, + bbox_head_2d=bbox_head_2d, + depth_head_2d=depth_head_2d, + depth_head=depth_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg) self.voxel_size = voxel_size self.voxel_range = anchor_generator['ranges'][0] self.n_voxels = [ @@ -371,6 +356,139 @@ def feature_transformation(self, batch_feats, batch_img_metas, num_views, transform_feats += 
(batch_stereo_feats, )
+        return transform_feats
+
+    def loss(self, batch_inputs: Tensor,
+             batch_data_samples: SampleList) -> Union[dict, tuple]:
+        """Calculate losses from a batch of inputs and data samples.
+
+        Args:
+            batch_inputs (Tensor): The batched input images.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        feats = self.extract_feat(batch_inputs, batch_data_samples)
+        bev_feat = feats[0]
+        losses = self.bbox_head_3d.loss([bev_feat], batch_data_samples)
+        return losses
+
+    def predict(self, batch_inputs: Tensor,
+                batch_data_samples: SampleList) -> SampleList:
+        """Predict results from a batch of inputs and data samples with post-
+        processing.
+
+        Args:
+            batch_inputs (Tensor): The batched input images.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                samples. It usually includes information such as
+                `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
+
+        Returns:
+            list[:obj:`Det3DDataSample`]: Detection results of the
+            input samples. Each Det3DDataSample usually contain
+            'pred_instances_3d'. And the ``pred_instances_3d`` usually
+            contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+              (num_instance, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes_3d (Tensor): Contains a tensor with shape
+              (num_instances, C) where C >=7.
+        """
+        feats = self.extract_feat(batch_inputs, batch_data_samples)
+        bev_feat = feats[0]
+        results_list = self.bbox_head_3d.predict([bev_feat],
+                                                 batch_data_samples)
+        predictions = self.add_pred_to_datasample(batch_data_samples,
+                                                  results_list)
+        return predictions
+
+    def _forward(self,
+                 batch_inputs: Tensor,
+                 batch_data_samples: SampleList = None):
+        """Network forward process.
+
+        Usually includes backbone, neck and head forward without any post-
+        processing.
+        """
+        feats = self.extract_feat(batch_inputs, batch_data_samples)
+        bev_feat = feats[0]
+        return self.bbox_head_3d.forward([bev_feat])
+
+    def add_pred_to_datasample(
+        self,
+        data_samples: SampleList,
+        data_instances_3d: OptInstanceList = None,
+        data_instances_2d: OptInstanceList = None,
+    ) -> SampleList:
+        """Convert results list to `Det3DDataSample`.
+
+        Subclasses could override it to be compatible for some multi-modality
+        3D detectors.
+
+        Args:
+            data_samples (list[:obj:`Det3DDataSample`]): The input data.
+            data_instances_3d (list[:obj:`InstanceData`], optional): 3D
+                Detection results of each sample.
+            data_instances_2d (list[:obj:`InstanceData`], optional): 2D
+                Detection results of each sample.
+
+        Returns:
+            list[:obj:`Det3DDataSample`]: Detection results of the
+            input. Each Det3DDataSample usually contains
+            'pred_instances_3d'. And the ``pred_instances_3d`` normally
+            contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+              (num_instance, )
+            - labels_3d (Tensor): Labels of 3D bboxes, has a shape
+              (num_instances, ).
+            - bboxes_3d (Tensor): Contains a tensor with shape
+              (num_instances, C) where C >=7.
+ + When there are image prediction in some models, it should + contains `pred_instances`, And the ``pred_instances`` normally + contains following keys. + + - scores (Tensor): Classification scores of image, has a shape + (num_instance, ) + - labels (Tensor): Predict Labels of 2D bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Contains a tensor with shape + (num_instances, 4). + """ + + assert (data_instances_2d is not None) or \ + (data_instances_3d is not None),\ + 'please pass at least one type of data_samples' + + if data_instances_2d is None: + data_instances_2d = [ + InstanceData() for _ in range(len(data_instances_3d)) + ] + if data_instances_3d is None: + data_instances_3d = [ + InstanceData() for _ in range(len(data_instances_2d)) + ] + + for i, data_sample in enumerate(data_samples): + data_sample.pred_instances_3d = data_instances_3d[i] + data_sample.pred_instances = data_instances_2d[i] + return data_samples + def aug_test(self, imgs, img_metas, **kwargs): """Test with augmentations.