From bfb10b6049670568d9956c31236afc0489b3cb12 Mon Sep 17 00:00:00 2001 From: sunjiahao1999 <578431509@qq.com> Date: Mon, 4 Dec 2023 13:03:32 +0800 Subject: [PATCH] add mvfcos3d --- .../datasets/waymoD3-fov-mono3d-3class.py | 2 +- .../datasets/waymoD3-mv-mono3d-3class.py | 20 +- .../_base_/datasets/waymoD5-mv3d-3class.py | 26 ++- configs/_base_/models/multiview_dfm.py | 29 +-- ...ew-dfm_r101-dcn_16xb2_waymoD5-3d-3class.py | 2 +- ...n_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d.py | 30 +++ mmdet3d/datasets/transforms/transforms_3d.py | 1 + mmdet3d/models/detectors/dfm.py | 6 +- mmdet3d/models/detectors/multiview_dfm.py | 195 ++++++++++++++---- 9 files changed, 240 insertions(+), 71 deletions(-) diff --git a/configs/_base_/datasets/waymoD3-fov-mono3d-3class.py b/configs/_base_/datasets/waymoD3-fov-mono3d-3class.py index 73020cf478..9456a6705e 100644 --- a/configs/_base_/datasets/waymoD3-fov-mono3d-3class.py +++ b/configs/_base_/datasets/waymoD3-fov-mono3d-3class.py @@ -35,7 +35,7 @@ # base shape (1248, 832), scale (0.95, 1.05) dict( type='RandomResize3D', - scale=(1284, 832), + scale=(1248, 832), ratio_range=(0.95, 1.05), # ratio_range=(1., 1.), interpolation='nearest', diff --git a/configs/_base_/datasets/waymoD3-mv-mono3d-3class.py b/configs/_base_/datasets/waymoD3-mv-mono3d-3class.py index 48001c0c34..7e18c9c021 100644 --- a/configs/_base_/datasets/waymoD3-mv-mono3d-3class.py +++ b/configs/_base_/datasets/waymoD3-mv-mono3d-3class.py @@ -2,7 +2,7 @@ # D3 in the config name means the whole dataset is divided into 3 folds # We only use one fold for efficient experiments dataset_type = 'WaymoDataset' -data_root = 'data/waymo_mini/kitti_format/' +data_root = 'data/waymo/kitti_format/' class_names = ['Pedestrian', 'Cyclist', 'Car'] metainfo = dict(classes=class_names) input_modality = dict(use_lidar=False, use_camera=True) @@ -35,11 +35,13 @@ # base shape (1248, 832), scale (0.95, 1.05) dict( type='RandomResize3D', - scale=(1284, 832), + scale=(1248, 832), + # ratio_range=(1., 1.), ratio_range=(0.95, 1.05), + interpolation='nearest', keep_ratio=True, ), - # dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict( type='Pack3DDetInputs', keys=[ @@ -83,9 +85,9 @@ train_dataloader = dict( batch_size=3, - num_workers=0, - persistent_workers=False, - sampler=dict(type='DefaultSampler', shuffle=False), + num_workers=3, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), dataset=dict( type=dataset_type, data_root=data_root, @@ -170,9 +172,9 @@ val_evaluator = dict( type='WaymoMetric', - ann_file='./data/waymo_mini/kitti_format/waymo_infos_val.pkl', - waymo_bin_file='./data/waymo_mini/waymo_format/cam_gt_mini.bin', - pklfile_prefix='./waymo_mv_pred_fix_resize_2', + ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl', + waymo_bin_file='./data/waymo/waymo_format/cam_gt.bin', + pklfile_prefix='./pgd_mv', metric='LET_mAP', convert_kitti_format=False, load_type='mv_image_based', diff --git a/configs/_base_/datasets/waymoD5-mv3d-3class.py b/configs/_base_/datasets/waymoD5-mv3d-3class.py index a9cd619da4..43ed90f9e6 100644 --- a/configs/_base_/datasets/waymoD5-mv3d-3class.py +++ b/configs/_base_/datasets/waymoD5-mv3d-3class.py @@ -19,7 +19,7 @@ # })) backend_args = None -class_names = ['Car', 'Pedestrian', 'Cyclist'] +class_names = ['Pedestrian', 'Cyclist', 'Car'] input_modality = dict(use_lidar=False, use_camera=True) point_cloud_range = [-35.0, -75.0, -2, 75.0, 75.0, 4] @@ -29,8 +29,9 @@ type='RandomResize3D', 
scale=(1248, 832), ratio_range=(0.95, 1.05), + # ratio_range=(1., 1.), keep_ratio=True), - dict(type='RandomCrop3D', crop_size=(720, 1080)), + dict(type='RandomCrop3D', crop_size=(1080, 720)), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5, flip_box3d=False), ] @@ -48,6 +49,9 @@ with_label_3d=True, with_bbox_depth=True), dict(type='MultiViewWrapper', transforms=train_transforms), + # randomness_keys= [ + # 'scale', 'scale_factor', 'crop_size', 'img_crop_offset', 'flip', + # 'flip_direction']), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectNameFilter', classes=class_names), dict( @@ -70,7 +74,11 @@ to_float32=True, backend_args=backend_args), dict(type='MultiViewWrapper', transforms=test_transforms), - dict(type='Pack3DDetInputs', keys=['img']) + dict(type='Pack3DDetInputs', keys=['img'], meta_keys=[ + 'box_type_3d', 'img_shape', 'ori_cam2img', 'scale_factor', + 'sample_idx', 'context_name', 'timestamp', 'lidar2cam', + 'num_ref_frames', 'num_views' + ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. client) @@ -80,7 +88,11 @@ to_float32=True, backend_args=backend_args), dict(type='MultiViewWrapper', transforms=test_transforms), - dict(type='Pack3DDetInputs', keys=['img']) + dict(type='Pack3DDetInputs', keys=['img'], meta_keys=[ + 'box_type_3d', 'img_shape', 'ori_cam2img', 'scale_factor', + 'sample_idx', 'context_name', 'timestamp', 'lidar2cam', + 'num_ref_frames', 'num_views' + ]) ] metainfo = dict(classes=class_names) @@ -103,6 +115,7 @@ pipeline=train_pipeline, modality=input_modality, test_mode=False, + cam_sync_instances=True, metainfo=metainfo, box_type_3d='Lidar', load_interval=5, @@ -149,7 +162,7 @@ CAM_FRONT_RIGHT='training/image_2', CAM_SIDE_LEFT='training/image_3', CAM_SIDE_RIGHT='training/image_4'), - pipeline=eval_pipeline, + pipeline=test_pipeline, modality=input_modality, test_mode=True, metainfo=metainfo, @@ -159,7 +172,8 @@ type='WaymoMetric', ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl', waymo_bin_file='./data/waymo/waymo_format/cam_gt.bin', - data_root='./data/waymo/waymo_format', + pklfile_prefix='./mmdet3d_mvfoc3d_pred', + convert_kitti_format=False, metric='LET_mAP', backend_args=backend_args) diff --git a/configs/_base_/models/multiview_dfm.py b/configs/_base_/models/multiview_dfm.py index e6f4d276c7..f20ab6fd84 100644 --- a/configs/_base_/models/multiview_dfm.py +++ b/configs/_base_/models/multiview_dfm.py @@ -35,7 +35,7 @@ type='AlignedAnchor3DRangeGenerator', ranges=[[-35.0, -75.0, -2, 75.0, 75.0, 4]], rotations=[.0]), - bbox_head=dict( + bbox_head_3d=dict( type='Anchor3DHead', num_classes=3, in_channels=256, @@ -43,13 +43,13 @@ use_direction_classifier=True, anchor_generator=dict( type='AlignedAnchor3DRangeGenerator', - ranges=[[-35.0, -75.0, -0.0345, 75.0, 75.0, -0.0345], - [-35.0, -75.0, 0, 75.0, 75.0, 0], - [-35.0, -75.0, -0.1188, 75.0, 75.0, -0.1188]], + ranges=[[-35.0, -75.0, 0, 75.0, 75.0, 0], + [-35.0, -75.0, -0.1188, 75.0, 75.0, -0.1188], + [-35.0, -75.0, -0.0345, 75.0, 75.0, -0.0345]], sizes=[ - [4.73, 2.08, 1.77], # car [0.91, 0.84, 1.74], # pedestrian [1.81, 0.84, 1.77], # cyclist + [4.73, 2.08, 1.77], # car ], rotations=[0, 1.57], reshape_out=False), @@ -69,13 +69,6 @@ loss_weight=0.2)), train_cfg=dict( assigner=[ - dict( # for Car - type='Max3DIoUAssigner', - iou_calculator=dict(type='BboxOverlapsNearest3D'), - pos_iou_thr=0.6, - neg_iou_thr=0.45, - min_pos_iou=0.45, - ignore_iof_thr=-1), dict( # for 
Pedestrian type='Max3DIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), @@ -90,6 +83,14 @@ neg_iou_thr=0.35, min_pos_iou=0.35, ignore_iof_thr=-1), + dict( # for Car + type='Max3DIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.6, + neg_iou_thr=0.45, + min_pos_iou=0.45, + ignore_iof_thr=-1), + ], allowed_border=0, pos_weight=-1, @@ -100,5 +101,5 @@ nms_thr=0.05, score_thr=0.001, min_bbox_size=0, - nms_pre=500, - max_num=100)) + nms_pre=4096, + max_num=500)) diff --git a/configs/dfm/multiview-dfm_r101-dcn_16xb2_waymoD5-3d-3class.py b/configs/dfm/multiview-dfm_r101-dcn_16xb2_waymoD5-3d-3class.py index b75a6db5b3..cabc368da7 100644 --- a/configs/dfm/multiview-dfm_r101-dcn_16xb2_waymoD5-3d-3class.py +++ b/configs/dfm/multiview-dfm_r101-dcn_16xb2_waymoD5-3d-3class.py @@ -44,6 +44,6 @@ ) log_level = 'INFO' -load_from = None +load_from = 'work_dirs/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d/epoch_24.pth' resume = False find_unused_parameters = True # only 1 of 4 FPN outputs is used diff --git a/configs/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d.py b/configs/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d.py index c5ca373776..12fc725df9 100644 --- a/configs/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d.py +++ b/configs/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d.py @@ -79,3 +79,33 @@ 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 1.0, 1.0, 1.0, 1.0 ]), test_cfg=dict(nms_pre=100, nms_thr=0.05, score_thr=0.001, max_per_img=20)) + +# optimizer +optim_wrapper = dict( + optimizer=dict( + type='SGD', + lr=0.008, + ), + paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.), + clip_grad=dict(max_norm=35, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0 / 3, + by_epoch=False, + begin=0, + end=500), + dict( + type='MultiStepLR', + begin=0, + end=24, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] + +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=24, val_interval=24) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +auto_scale_lr = dict(enable=False, base_batch_size=48) \ No newline at end of file diff --git a/mmdet3d/datasets/transforms/transforms_3d.py b/mmdet3d/datasets/transforms/transforms_3d.py index 94275d42de..18b7e50c7b 100644 --- a/mmdet3d/datasets/transforms/transforms_3d.py +++ b/mmdet3d/datasets/transforms/transforms_3d.py @@ -2071,6 +2071,7 @@ def _crop_data(self, offset_w = np.random.randint( self.rel_offset_w[0] * margin_w, self.rel_offset_w[1] * margin_w + 1) + # offset_h, offset_w = 0, 0 else: offset_w, offset_h = results['img_crop_offset'] diff --git a/mmdet3d/models/detectors/dfm.py b/mmdet3d/models/detectors/dfm.py index 736e43e826..3f6118dcaf 100644 --- a/mmdet3d/models/detectors/dfm.py +++ b/mmdet3d/models/detectors/dfm.py @@ -13,6 +13,9 @@ class DfM(BaseDetector): `_. Args: + data_preprocessor (:obj:`ConfigDict` or dict): The pre-process + config of :class:`BaseDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. backbone (:obj:`ConfigDict` or dict): The backbone config. neck (:obj:`ConfigDict` or dict): The neck config. 
backbone_stereo (:obj:`ConfigDict` or dict): The stereo backbone @@ -39,6 +42,7 @@ class DfM(BaseDetector): """ def __init__(self, + data_preprocessor: ConfigType, backbone: ConfigType, neck: ConfigType, backbone_stereo: ConfigType, @@ -53,7 +57,7 @@ def __init__(self, test_cfg=None, pretrained=None, init_cfg=None): - super().__init__(init_cfg=init_cfg) + super().__init__(data_preprocessor= data_preprocessor,init_cfg=init_cfg) self.backbone = MODELS.build(backbone) self.neck = MODELS.build(neck) if backbone_stereo is not None: diff --git a/mmdet3d/models/detectors/multiview_dfm.py b/mmdet3d/models/detectors/multiview_dfm.py index fce4c92014..c195fbc47f 100644 --- a/mmdet3d/models/detectors/multiview_dfm.py +++ b/mmdet3d/models/detectors/multiview_dfm.py @@ -1,20 +1,21 @@ # Copyright (c) OpenMMLab. All rights reserved. import numpy as np import torch -from mmdet.models.detectors import BaseDetector +from torch import Tensor +from typing import Union from mmdet3d.models.layers.fusion_layers.point_fusion import (point_sample, voxel_sample) from mmdet3d.registry import MODELS, TASK_UTILS from mmdet3d.structures.bbox_3d.utils import get_lidar2img from mmdet3d.structures.det3d_data_sample import SampleList +from mmengine.structures import InstanceData from mmdet3d.utils import ConfigType, OptConfigType from .dfm import DfM -from .imvoxelnet import ImVoxelNet - +from mmdet3d.utils.typing_utils import OptConfigType, OptInstanceList @MODELS.register_module() -class MultiViewDfM(ImVoxelNet, DfM): +class MultiViewDfM(DfM): r"""Waymo challenge solution of `MV-FCOS3D++ `_. @@ -25,7 +26,7 @@ class MultiViewDfM(ImVoxelNet, DfM): config. backbone_3d (:obj:`ConfigDict` or dict): The 3d backbone config. neck_3d (:obj:`ConfigDict` or dict): The 3D neck config. - bbox_head (:obj:`ConfigDict` or dict): The bbox head config. + bbox_head_3d (:obj:`ConfigDict` or dict): The bbox head config. voxel_size (:obj:`ConfigDict` or dict): The voxel size. anchor_generator (:obj:`ConfigDict` or dict): The anchor generator config. 
@@ -60,7 +61,7 @@ def __init__(self, backbone_stereo: ConfigType, backbone_3d: ConfigType, neck_3d: ConfigType, - bbox_head: ConfigType, + bbox_head_3d: ConfigType, voxel_size: ConfigType, anchor_generator: ConfigType, neck_2d: ConfigType = None, @@ -71,41 +72,24 @@ def __init__(self, test_cfg: OptConfigType = None, data_preprocessor: OptConfigType = None, valid_sample: bool = True, - temporal_aggregate: str = 'concat', + temporal_aggregate: str = 'mean', transform_depth: bool = True, init_cfg: OptConfigType = None): - # TODO merge with DFM - BaseDetector.__init__( - self, data_preprocessor=data_preprocessor, init_cfg=init_cfg) - - self.backbone = MODELS.build(backbone) - self.neck = MODELS.build(neck) - if backbone_stereo is not None: - backbone_stereo.update(cat_img_feature=self.neck.cat_img_feature) - backbone_stereo.update(in_sem_channels=self.neck.sem_channels[-1]) - self.backbone_stereo = MODELS.build(backbone_stereo) - assert self.neck.cat_img_feature == \ - self.backbone_stereo.cat_img_feature - assert self.neck.sem_channels[ - -1] == self.backbone_stereo.in_sem_channels - if backbone_3d is not None: - self.backbone_3d = MODELS.build(backbone_3d) - if neck_3d is not None: - self.neck_3d = MODELS.build(neck_3d) - if neck_2d is not None: - self.neck_2d = MODELS.build(neck_2d) - if bbox_head_2d is not None: - self.bbox_head_2d = MODELS.build(bbox_head_2d) - if depth_head_2d is not None: - self.depth_head_2d = MODELS.build(depth_head_2d) - if depth_head is not None: - self.depth_head = MODELS.build(depth_head) - self.depth_samples = self.depth_head.depth_samples - self.train_cfg = train_cfg - self.test_cfg = test_cfg - bbox_head.update(train_cfg=train_cfg) - bbox_head.update(test_cfg=test_cfg) - self.bbox_head = MODELS.build(bbox_head) + super().__init__( + data_preprocessor=data_preprocessor, + backbone=backbone, + neck=neck, + backbone_stereo=backbone_stereo, + backbone_3d=backbone_3d, + neck_3d=neck_3d, + bbox_head_3d=bbox_head_3d, + neck_2d=neck_2d, + bbox_head_2d=bbox_head_2d, + depth_head_2d=depth_head_2d, + depth_head=depth_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg) self.voxel_size = voxel_size self.voxel_range = anchor_generator['ranges'][0] self.n_voxels = [ @@ -371,6 +355,139 @@ def feature_transformation(self, batch_feats, batch_img_metas, num_views, transform_feats += (batch_stereo_feats, ) return transform_feats + def loss(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> Union[dict, tuple]: + """Calculate losses from a batch of inputs dict and data samples. + + Args: + batch_inputs_dict (dict): The model input dict which include + 'points', 'img' keys. + + - points (list[torch.Tensor]): Point cloud of each sample. + - imgs (torch.Tensor, optional): Image of each sample. + + batch_data_samples (List[:obj:`Det3DDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`. + + Returns: + dict: A dictionary of loss components. + """ + feats = self.extract_feat(batch_inputs, batch_data_samples) + bev_feat = feats[0] + losses = self.bbox_head_3d.loss([bev_feat], batch_data_samples) + return losses + + def predict(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. + + Args: + batch_inputs_dict (dict): The model input dict which include + 'points', 'imgs' keys. + + - points (list[torch.Tensor]): Point cloud of each sample. 
+                - imgs (torch.Tensor, optional): Image of each sample.
+
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                samples. It usually includes information such as
+                `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
+
+        Returns:
+            list[:obj:`Det3DDataSample`]: Detection results of the
+            input samples. Each Det3DDataSample usually contains
+            'pred_instances_3d'. And the ``pred_instances_3d`` usually
+            contains the following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+                (num_instance, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+                (num_instances, ).
+            - bboxes_3d (Tensor): Contains a tensor with shape
+                (num_instances, C) where C >=7.
+        """
+        feats = self.extract_feat(batch_inputs, batch_data_samples)
+        bev_feat = feats[0]
+        results_list = self.bbox_head_3d.predict([bev_feat], batch_data_samples)
+        predictions = self.add_pred_to_datasample(batch_data_samples,
+                                                  results_list)
+        return predictions
+
+    def _forward(self,
+                 batch_inputs: Tensor,
+                 batch_data_samples: SampleList = None):
+        """Network forward process.
+
+        Usually includes backbone, neck and head forward without any post-
+        processing.
+        """
+        feats = self.extract_feat(batch_inputs, batch_data_samples)
+        bev_feat = feats[0]
+        return self.bbox_head_3d.forward([bev_feat])
+
+
+    def add_pred_to_datasample(
+        self,
+        data_samples: SampleList,
+        data_instances_3d: OptInstanceList = None,
+        data_instances_2d: OptInstanceList = None,
+    ) -> SampleList:
+        """Convert results list to `Det3DDataSample`.
+
+        Subclasses could override it to be compatible with some multi-modality
+        3D detectors.
+
+        Args:
+            data_samples (list[:obj:`Det3DDataSample`]): The input data.
+            data_instances_3d (list[:obj:`InstanceData`], optional): 3D
+                Detection results of each sample.
+            data_instances_2d (list[:obj:`InstanceData`], optional): 2D
+                Detection results of each sample.
+
+        Returns:
+            list[:obj:`Det3DDataSample`]: Detection results of the
+            input. Each Det3DDataSample usually contains
+            'pred_instances_3d'. And the ``pred_instances_3d`` normally
+            contains the following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+                (num_instance, )
+            - labels_3d (Tensor): Labels of 3D bboxes, has a shape
+                (num_instances, ).
+            - bboxes_3d (Tensor): Contains a tensor with shape
+                (num_instances, C) where C >=7.
+
+            When there are image predictions in some models, it should
+            contain ``pred_instances``, and the ``pred_instances`` normally
+            contains the following keys.
+
+            - scores (Tensor): Classification scores of image, has a shape
+                (num_instance, )
+            - labels (Tensor): Predicted labels of 2D bboxes, has a shape
+                (num_instances, ).
+            - bboxes (Tensor): Contains a tensor with shape
+                (num_instances, 4).
+        """
+
+        assert (data_instances_2d is not None) or \
+               (data_instances_3d is not None),\
+            'please pass at least one type of data_samples'
+
+        if data_instances_2d is None:
+            data_instances_2d = [
+                InstanceData() for _ in range(len(data_instances_3d))
+            ]
+        if data_instances_3d is None:
+            data_instances_3d = [
+                InstanceData() for _ in range(len(data_instances_2d))
+            ]
+
+        for i, data_sample in enumerate(data_samples):
+            data_sample.pred_instances_3d = data_instances_3d[i]
+            data_sample.pred_instances = data_instances_2d[i]
+        return data_samples
+
     def aug_test(self, imgs, img_metas, **kwargs):
         """Test with augmentations.