[Feature] Support PGD and multi-view FCOS3D++ on Waymo (#2835)
Co-authored-by: JingweiZhang12 <zjw18@mails.tsinghua.edu.cn>
Co-authored-by: sjh <sunjiahao1999>
sunjiahao1999 and JingweiZhang12 committed Jan 4, 2024
1 parent 2dad86c commit 88b8694
Showing 16 changed files with 909 additions and 127 deletions.
184 changes: 184 additions & 0 deletions configs/_base_/datasets/waymoD3-fov-mono3d-3class.py
@@ -0,0 +1,184 @@
# dataset settings
# D3 in the config name means the whole dataset is divided into 3 folds
# We only use one fold for efficient experiments
dataset_type = 'WaymoDataset'
data_root = 'data/waymo/kitti_format/'
class_names = ['Pedestrian', 'Cyclist', 'Car']
metainfo = dict(classes=class_names)
input_modality = dict(use_lidar=False, use_camera=True)

# Examples of using a different file client
# Method 1: simply set the data root and let the file I/O module
# automatically infer the backend from the path prefix
# (LMDB and Memcached are not supported yet)

# data_root = 's3://openmmlab/datasets/detection3d/waymo/kitti_format/'

# Method 2: use backend_args (called file_client_args in versions
# before 1.1.0)
# backend_args = dict(
#     backend='petrel',
#     path_mapping=dict({
#         './data/': 's3://openmmlab/datasets/detection3d/',
#         'data/': 's3://openmmlab/datasets/detection3d/'
#     }))
backend_args = None

train_pipeline = [
    dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
    dict(
        type='LoadAnnotations3D',
        with_bbox=True,
        with_label=True,
        with_attr_label=False,
        with_bbox_3d=True,
        with_label_3d=True,
        with_bbox_depth=True),
    # base shape (1248, 832), scale (0.95, 1.05)
    dict(
        type='RandomResize3D',
        scale=(1248, 832),
        ratio_range=(0.95, 1.05),
        # ratio_range=(1., 1.),
        interpolation='nearest',
        keep_ratio=True,
    ),
    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
    dict(
        type='Pack3DDetInputs',
        keys=[
            'img', 'gt_bboxes', 'gt_bboxes_labels', 'gt_bboxes_3d',
            'gt_labels_3d', 'centers_2d', 'depths'
        ]),
]

test_pipeline = [
    dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
    dict(
        type='RandomResize3D',
        scale=(1248, 832),
        ratio_range=(1., 1.),
        interpolation='nearest',
        keep_ratio=True),
    dict(
        type='Pack3DDetInputs',
        keys=['img'],
        meta_keys=[
            'box_type_3d', 'img_shape', 'cam2img', 'scale_factor',
            'sample_idx', 'context_name', 'timestamp', 'lidar2cam'
        ]),
]
# construct a pipeline for loading data and GT in the show function;
# please keep its loading behavior consistent with test_pipeline (e.g. the
# same file client)
eval_pipeline = [
    dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
    dict(
        type='RandomResize3D',
        scale=(1248, 832),
        ratio_range=(1., 1.),
        interpolation='nearest',
        keep_ratio=True),
    dict(
        type='Pack3DDetInputs',
        keys=['img'],
        meta_keys=[
            'box_type_3d', 'img_shape', 'cam2img', 'scale_factor',
            'sample_idx', 'context_name', 'timestamp', 'lidar2cam'
        ]),
]

train_dataloader = dict(
    batch_size=3,
    num_workers=3,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=True),
    dataset=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file='waymo_infos_train.pkl',
        data_prefix=dict(
            pts='training/velodyne',
            CAM_FRONT='training/image_0',
            CAM_FRONT_LEFT='training/image_1',
            CAM_FRONT_RIGHT='training/image_2',
            CAM_SIDE_LEFT='training/image_3',
            CAM_SIDE_RIGHT='training/image_4'),
        pipeline=train_pipeline,
        modality=input_modality,
        test_mode=False,
        metainfo=metainfo,
        cam_sync_instances=True,
        # we use box_type_3d='LiDAR' in the KITTI and nuScenes datasets
        # and box_type_3d='Depth' in the SUN RGB-D and ScanNet datasets.
        box_type_3d='Camera',
        load_type='fov_image_based',
        # load one frame out of every three
        load_interval=3,
        backend_args=backend_args))

val_dataloader = dict(
    batch_size=1,
    num_workers=1,
    persistent_workers=True,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type=dataset_type,
        data_root=data_root,
        data_prefix=dict(
            pts='training/velodyne',
            CAM_FRONT='training/image_0',
            CAM_FRONT_LEFT='training/image_1',
            CAM_FRONT_RIGHT='training/image_2',
            CAM_SIDE_LEFT='training/image_3',
            CAM_SIDE_RIGHT='training/image_4'),
        ann_file='waymo_infos_val.pkl',
        pipeline=eval_pipeline,
        modality=input_modality,
        test_mode=True,
        metainfo=metainfo,
        cam_sync_instances=True,
        # we use box_type_3d='LiDAR' in the KITTI and nuScenes datasets
        # and box_type_3d='Depth' in the SUN RGB-D and ScanNet datasets.
        box_type_3d='Camera',
        load_type='fov_image_based',
        load_eval_anns=False,
        backend_args=backend_args))

test_dataloader = dict(
    batch_size=1,
    num_workers=1,
    persistent_workers=True,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type=dataset_type,
        data_root=data_root,
        data_prefix=dict(
            pts='training/velodyne',
            CAM_FRONT='training/image_0',
            CAM_FRONT_LEFT='training/image_1',
            CAM_FRONT_RIGHT='training/image_2',
            CAM_SIDE_LEFT='training/image_3',
            CAM_SIDE_RIGHT='training/image_4'),
        ann_file='waymo_infos_val.pkl',
        pipeline=eval_pipeline,
        modality=input_modality,
        test_mode=True,
        metainfo=metainfo,
        cam_sync_instances=True,
        # we use box_type_3d='LiDAR' in the KITTI and nuScenes datasets
        # and box_type_3d='Depth' in the SUN RGB-D and ScanNet datasets.
        box_type_3d='Camera',
        load_type='fov_image_based',
        backend_args=backend_args))

val_evaluator = dict(
    type='WaymoMetric',
    waymo_bin_file='./data/waymo/waymo_format/fov_gt.bin',
    metric='LET_mAP',
    load_type='fov_image_based',
    result_prefix='./pgd_fov_pred')
test_evaluator = val_evaluator

vis_backends = [dict(type='LocalVisBackend')]
visualizer = dict(
    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
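
For context on how this base file is consumed: downstream model configs pull it in through MMEngine's `_base_` inheritance and override only what differs. A minimal sketch follows; the base paths and the override are illustrative assumptions, not the exact PGD config files added in this commit.

# Hypothetical model config built on the dataset base above; the model and
# schedule base paths are illustrative assumptions.
_base_ = [
    '../_base_/datasets/waymoD3-fov-mono3d-3class.py',
    '../_base_/models/pgd.py',
    '../_base_/schedules/mmdet-schedule-1x.py',
    '../_base_/default_runtime.py',
]

# Keys set here are recursively merged over the inherited config, e.g.
# enlarging the batch size without restating the whole dataloader:
train_dataloader = dict(batch_size=4, num_workers=4)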
191 changes: 191 additions & 0 deletions configs/_base_/datasets/waymoD3-mv-mono3d-3class.py
@@ -0,0 +1,191 @@
# dataset settings
# D3 in the config name means the whole dataset is divided into 3 folds
# We only use one fold for efficient experiments
dataset_type = 'WaymoDataset'
data_root = 'data/waymo/kitti_format/'
class_names = ['Pedestrian', 'Cyclist', 'Car']
metainfo = dict(classes=class_names)
input_modality = dict(use_lidar=False, use_camera=True)

# Examples of using a different file client
# Method 1: simply set the data root and let the file I/O module
# automatically infer the backend from the path prefix
# (LMDB and Memcached are not supported yet)

# data_root = 's3://openmmlab/datasets/detection3d/waymo/kitti_format/'

# Method 2: use backend_args (called file_client_args in versions
# before 1.1.0)
# backend_args = dict(
#     backend='petrel',
#     path_mapping=dict({
#         './data/': 's3://openmmlab/datasets/detection3d/',
#         'data/': 's3://openmmlab/datasets/detection3d/'
#     }))
backend_args = None

train_pipeline = [
    dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
    dict(
        type='LoadAnnotations3D',
        with_bbox=True,
        with_label=True,
        with_attr_label=False,
        with_bbox_3d=True,
        with_label_3d=True,
        with_bbox_depth=True),
    # base shape (1248, 832), scale (0.95, 1.05)
    dict(
        type='RandomResize3D',
        scale=(1248, 832),
        # ratio_range=(1., 1.),
        ratio_range=(0.95, 1.05),
        interpolation='nearest',
        keep_ratio=True,
    ),
    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
    dict(
        type='Pack3DDetInputs',
        keys=[
            'img', 'gt_bboxes', 'gt_bboxes_labels', 'gt_bboxes_3d',
            'gt_labels_3d', 'centers_2d', 'depths'
        ]),
]

test_pipeline = [
    dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
    dict(
        type='Resize3D',
        scale_factor=0.65,
        interpolation='nearest',
        keep_ratio=True),
    dict(
        type='Pack3DDetInputs',
        keys=['img'],
        meta_keys=[
            'box_type_3d', 'img_shape', 'cam2img', 'scale_factor',
            'sample_idx', 'context_name', 'timestamp', 'lidar2cam'
        ]),
]
# construct a pipeline for loading data and GT in the show function;
# please keep its loading behavior consistent with test_pipeline (e.g. the
# same file client)
eval_pipeline = [
    dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
    dict(
        type='Resize3D',
        scale_factor=0.65,
        interpolation='nearest',
        keep_ratio=True),
    dict(
        type='Pack3DDetInputs',
        keys=['img'],
        meta_keys=[
            'box_type_3d', 'img_shape', 'cam2img', 'scale_factor',
            'sample_idx', 'context_name', 'timestamp', 'lidar2cam'
        ]),
]

train_dataloader = dict(
    batch_size=3,
    num_workers=3,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=True),
    dataset=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file='waymo_infos_train.pkl',
        data_prefix=dict(
            pts='training/velodyne',
            CAM_FRONT='training/image_0',
            CAM_FRONT_LEFT='training/image_1',
            CAM_FRONT_RIGHT='training/image_2',
            CAM_SIDE_LEFT='training/image_3',
            CAM_SIDE_RIGHT='training/image_4'),
        pipeline=train_pipeline,
        modality=input_modality,
        test_mode=False,
        metainfo=metainfo,
        cam_sync_instances=True,
        # we use box_type_3d='LiDAR' in the KITTI and nuScenes datasets
        # and box_type_3d='Depth' in the SUN RGB-D and ScanNet datasets.
        box_type_3d='Camera',
        load_type='mv_image_based',
        # load one frame out of every three
        load_interval=3,
        backend_args=backend_args))

val_dataloader = dict(
    batch_size=1,
    num_workers=0,
    persistent_workers=False,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type=dataset_type,
        data_root=data_root,
        data_prefix=dict(
            pts='training/velodyne',
            CAM_FRONT='training/image_0',
            CAM_FRONT_LEFT='training/image_1',
            CAM_FRONT_RIGHT='training/image_2',
            CAM_SIDE_LEFT='training/image_3',
            CAM_SIDE_RIGHT='training/image_4'),
        ann_file='waymo_infos_val.pkl',
        pipeline=eval_pipeline,
        modality=input_modality,
        test_mode=True,
        metainfo=metainfo,
        cam_sync_instances=True,
        # we use box_type_3d='LiDAR' in the KITTI and nuScenes datasets
        # and box_type_3d='Depth' in the SUN RGB-D and ScanNet datasets.
        box_type_3d='Camera',
        load_type='mv_image_based',
        # load_eval_anns=False,
        backend_args=backend_args))

test_dataloader = dict(
    batch_size=1,
    num_workers=0,
    persistent_workers=False,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type=dataset_type,
        data_root=data_root,
        data_prefix=dict(
            pts='training/velodyne',
            CAM_FRONT='training/image_0',
            CAM_FRONT_LEFT='training/image_1',
            CAM_FRONT_RIGHT='training/image_2',
            CAM_SIDE_LEFT='training/image_3',
            CAM_SIDE_RIGHT='training/image_4'),
        ann_file='waymo_infos_val.pkl',
        pipeline=eval_pipeline,
        modality=input_modality,
        test_mode=True,
        metainfo=metainfo,
        cam_sync_instances=True,
        # we use box_type_3d='LiDAR' in the KITTI and nuScenes datasets
        # and box_type_3d='Depth' in the SUN RGB-D and ScanNet datasets.
        box_type_3d='Camera',
        load_type='mv_image_based',
        load_eval_anns=False,
        backend_args=backend_args))

val_evaluator = dict(
    type='WaymoMetric',
    waymo_bin_file='./data/waymo/waymo_format/cam_gt.bin',
    metric='LET_mAP',
    load_type='mv_image_based',
    result_prefix='./pgd_mv_pred',
    nms_cfg=dict(
        use_rotate_nms=True,
        nms_across_levels=False,
        nms_pre=500,
        nms_thr=0.05,
        score_thr=0.001,
        min_bbox_size=0,
        max_per_frame=100))
test_evaluator = val_evaluator

vis_backends = [dict(type='LocalVisBackend')]
visualizer = dict(
    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
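
Unlike the FoV variant, this evaluator scores predictions gathered from all five cameras per frame, so the same object seen in overlapping views must be deduplicated before LET-mAP is computed; that is what the `nms_cfg` above controls. Below is a minimal NumPy sketch of that greedy suppression, assuming boxes have already been pooled into a common frame. Axis-aligned BEV IoU stands in for the rotated IoU that `use_rotate_nms=True` selects, and the function name and box layout are illustrative, not the library's API.

import numpy as np

def greedy_nms_bev(boxes, scores, nms_pre=500, nms_thr=0.05,
                   score_thr=0.001, max_per_frame=100):
    """Greedily suppress overlapping BEV boxes [x_center, y_center, w, l]."""
    keep_mask = scores > score_thr                  # drop near-zero scores
    boxes, scores = boxes[keep_mask], scores[keep_mask]
    order = np.argsort(-scores)[:nms_pre]           # top-k candidates by score
    boxes, scores = boxes[order], scores[order]

    # axis-aligned corners (x1, y1, x2, y2) of each BEV box
    x1 = boxes[:, 0] - boxes[:, 2] / 2
    y1 = boxes[:, 1] - boxes[:, 3] / 2
    x2 = boxes[:, 0] + boxes[:, 2] / 2
    y2 = boxes[:, 1] + boxes[:, 3] / 2
    areas = (x2 - x1) * (y2 - y1)

    keep, idxs = [], np.arange(len(boxes))
    while idxs.size > 0 and len(keep) < max_per_frame:
        i = idxs[0]                                 # highest remaining score
        keep.append(i)
        xx1 = np.maximum(x1[i], x1[idxs[1:]])
        yy1 = np.maximum(y1[i], y1[idxs[1:]])
        xx2 = np.minimum(x2[i], x2[idxs[1:]])
        yy2 = np.minimum(y2[i], y2[idxs[1:]])
        inter = np.clip(xx2 - xx1, 0, None) * np.clip(yy2 - yy1, 0, None)
        iou = inter / (areas[i] + areas[idxs[1:]] - inter)
        idxs = idxs[1:][iou <= nms_thr]             # discard heavy overlaps
    return keep                                     # indices into filtered set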