From 40bd7b1b27eeb2d4a6316f37bebd268e0248c9d6 Mon Sep 17 00:00:00 2001 From: sunjiahao1999 <578431509@qq.com> Date: Sun, 8 Oct 2023 13:59:13 +0800 Subject: [PATCH 01/11] fix lidar demo --- demo/new_pcd_demo.py | 96 ++++++++++ .../apis/inferencers/base_3d_inferencer.py | 177 ++++++++++++------ .../inferencers/lidar_det3d_inferencer.py | 85 +++++++-- mmdet3d/visualization/local_visualizer.py | 8 +- 4 files changed, 294 insertions(+), 72 deletions(-) create mode 100644 demo/new_pcd_demo.py diff --git a/demo/new_pcd_demo.py b/demo/new_pcd_demo.py new file mode 100644 index 0000000000..9dd80979c7 --- /dev/null +++ b/demo/new_pcd_demo.py @@ -0,0 +1,96 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import logging +import os +from argparse import ArgumentParser + +from mmengine.logging import print_log + +from mmdet3d.apis import LidarDet3DInferencer + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument('pcd', help='Point cloud file') + parser.add_argument('model', help='Config file') + parser.add_argument('weights', help='Checkpoint file') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--pred-score-thr', + type=float, + default=0.3, + help='bbox score threshold') + parser.add_argument( + '--out-dir', + type=str, + default='outputs', + help='Output directory of prediction results.') + parser.add_argument( + '--show', + action='store_true', + help='Show online visualization results') + parser.add_argument( + '--wait_time', + type=float, + default=-1, + help='The interval of show (s). Demo will be blocked in showing' + 'results, if wait_time is -1. Defaults to -1.') + parser.add_argument( + '--no-save-vis', + action='store_true', + help='Do not save detection vis results') + parser.add_argument( + '--no-save-pred', + action='store_true', + help='Do not save detection json results') + parser.add_argument( + '--print-result', + action='store_true', + help='Whether to print the results.') + call_args = vars(parser.parse_args()) + + call_args['inputs'] = dict(points=call_args.pop('pcd')) + + if call_args['no_save_vis'] and call_args['no_save_pred']: + call_args['out_dir'] = '' + + init_kws = ['model', 'weights', 'device'] + init_args = {} + for init_kw in init_kws: + init_args[init_kw] = call_args.pop(init_kw) + + # NOTE: If your operating environment does not have a display device, + # (e.g. a remote server), you can save the predictions and visualize + # them in local devices. + if os.environ.get('DISPLAY') is None and call_args['show']: + print_log( + 'Display device not found. `--show` is forced to False', + logger='current', + level=logging.WARNING) + call_args['show'] = False + + return init_args, call_args + + +def main(): + # TODO: Support inference of point cloud numpy file. 
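+    # `init_args` holds the constructor options (model/weights/device),
+    # while the remaining `call_args` are forwarded to the inferencer call.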
+ init_args, call_args = parse_args() + + inferencer = LidarDet3DInferencer(**init_args) + # inferencer(**call_args) + + inputs = call_args.pop('inputs') + _, _, visualize_kwargs, _ = inferencer._dispatch_kwargs(**call_args) + inputs = inferencer._inputs_to_list(inputs) + inferencer.visualize_preds_fromfile(inputs, ['outputs/preds/000008.json'], + **visualize_kwargs) + + if call_args['out_dir'] != '' and not (call_args['no_save_vis'] + and call_args['no_save_pred']): + print_log( + f'results have been saved at {call_args["out_dir"]}', + logger='current') + + +if __name__ == '__main__': + main() diff --git a/mmdet3d/apis/inferencers/base_3d_inferencer.py b/mmdet3d/apis/inferencers/base_3d_inferencer.py index 5bf411a8d4..26004512a3 100644 --- a/mmdet3d/apis/inferencers/base_3d_inferencer.py +++ b/mmdet3d/apis/inferencers/base_3d_inferencer.py @@ -1,18 +1,24 @@ # Copyright (c) OpenMMLab. All rights reserved. +import logging +import os.path as osp +from copy import deepcopy from typing import Dict, List, Optional, Sequence, Tuple, Union -import mmengine import numpy as np import torch.nn as nn +from mmengine import dump, print_log from mmengine.fileio import (get_file_backend, isdir, join_path, list_dir_or_file) from mmengine.infer.infer import BaseInferencer, ModelType +from mmengine.model.utils import revert_sync_batchnorm from mmengine.registry import init_default_scope from mmengine.runner import load_checkpoint from mmengine.structures import InstanceData from mmengine.visualization import Visualizer +from rich.progress import track -from mmdet3d.registry import MODELS +from mmdet3d.registry import DATASETS, MODELS +from mmdet3d.structures import Box3DMode, Det3DDataSample from mmdet3d.utils import ConfigType InstanceList = List[InstanceData] @@ -48,10 +54,10 @@ class Base3DInferencer(BaseInferencer): forward_kwargs: set = set() visualize_kwargs: set = { 'return_vis', 'show', 'wait_time', 'draw_pred', 'pred_score_thr', - 'img_out_dir' + 'img_out_dir', 'no_save_vis' } postprocess_kwargs: set = { - 'print_result', 'pred_out_file', 'return_datasample' + 'print_result', 'pred_out_dir', 'return_datasample', 'no_save_pred' } def __init__(self, @@ -60,10 +66,14 @@ def __init__(self, device: Optional[str] = None, scope: str = 'mmdet3d', palette: str = 'none') -> None: + # A global counter tracking the number of frames processed, for + # naming of the output results + self.num_predicted_frames = 0 self.palette = palette init_default_scope(scope) super().__init__( model=model, weights=weights, device=device, scope=scope) + self.model = revert_sync_batchnorm(self.model) def _convert_syncbn(self, cfg: ConfigType): """Convert config's naiveSyncBN to BN. @@ -108,6 +118,14 @@ def _init_model( if 'PALETTE' in checkpoint.get('meta', {}): # 3D Segmentor model.dataset_meta['palette'] = checkpoint['meta']['PALETTE'] + test_dataset_cfg = deepcopy(cfg.test_dataloader.dataset) + # lazy init. We only need the metainfo. 
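+        # Building the dataset with lazy_init=True skips annotation loading,
+        # so this is cheap and only used to read class names and palette.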
+ test_dataset_cfg['lazy_init'] = True + metainfo = DATASETS.build(test_dataset_cfg).metainfo + cfg_palette = metainfo.get('palette', None) + if cfg_palette is not None: + model.dataset_meta['palette'] = cfg_palette + model.cfg = cfg # save the config in the model for convenience model.to(device) model.eval() @@ -173,64 +191,76 @@ def _init_visualizer(self, cfg: ConfigType) -> Optional[Visualizer]: visualizer.dataset_meta = self.model.dataset_meta return visualizer + def _dispatch_kwargs(self, + out_dir: str = '', + **kwargs) -> Tuple[Dict, Dict, Dict, Dict]: + """Dispatch kwargs to preprocess(), forward(), visualize() and + postprocess() according to the actual demands. + + Args: + out_dir (str): Dir to save the inference results. + **kwargs (dict): Key words arguments passed to :meth:`preprocess`, + :meth:`forward`, :meth:`visualize` and :meth:`postprocess`. + Each key in kwargs should be in the corresponding set of + ``preprocess_kwargs``, ``forward_kwargs``, ``visualize_kwargs`` + and ``postprocess_kwargs``. + + Returns: + Tuple[Dict, Dict, Dict, Dict]: kwargs passed to preprocess, + forward, visualize and postprocess respectively. + """ + kwargs['img_out_dir'] = out_dir + kwargs['pred_out_dir'] = out_dir + return super()._dispatch_kwargs(**kwargs) + def __call__(self, inputs: InputsType, + batch_size=1, return_datasamples: bool = False, - batch_size: int = 1, - return_vis: bool = False, - show: bool = False, - wait_time: int = 0, - draw_pred: bool = True, - pred_score_thr: float = 0.3, - img_out_dir: str = '', - print_result: bool = False, - pred_out_file: str = '', - **kwargs) -> dict: + **kwargs) -> Optional[dict]: """Call the inferencer. Args: inputs (InputsType): Inputs for the inferencer. return_datasamples (bool): Whether to return results as :obj:`BaseDataElement`. Defaults to False. - batch_size (int): Inference batch size. Defaults to 1. - return_vis (bool): Whether to return the visualization result. - Defaults to False. - show (bool): Whether to display the visualization results in a - popup window. Defaults to False. - wait_time (float): The interval of show (s). Defaults to 0. - draw_pred (bool): Whether to draw predicted bounding boxes. - Defaults to True. - pred_score_thr (float): Minimum score of bboxes to draw. - Defaults to 0.3. - img_out_dir (str): Output directory of visualization results. - If left as empty, no file will be saved. Defaults to ''. - print_result (bool): Whether to print the inference result w/o - visualization to the console. Defaults to False. - pred_out_file (str): File to save the inference results w/o - visualization. If left as empty, no file will be saved. - Defaults to ''. - **kwargs: Other keyword arguments passed to :meth:`preprocess`, + batch_size (int): Batch size. Defaults to 1. + **kwargs: Key words arguments passed to :meth:`preprocess`, :meth:`forward`, :meth:`visualize` and :meth:`postprocess`. Each key in kwargs should be in the corresponding set of ``preprocess_kwargs``, ``forward_kwargs``, ``visualize_kwargs`` and ``postprocess_kwargs``. + Returns: dict: Inference and visualization results. 
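+
+        Example (illustrative only; the model alias and the point cloud
+        path are placeholders, not part of this API):
+            >>> inferencer = LidarDet3DInferencer('pointpillars_kitti-3class')
+            >>> results = inferencer(dict(points='./data/kitti/000008.bin'))
+            >>> list(results.keys())
+            ['predictions', 'visualization']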
""" - return super().__call__( - inputs, - return_datasamples, - batch_size, - return_vis=return_vis, - show=show, - wait_time=wait_time, - draw_pred=draw_pred, - pred_score_thr=pred_score_thr, - img_out_dir=img_out_dir, - print_result=print_result, - pred_out_file=pred_out_file, - **kwargs) + + ( + preprocess_kwargs, + forward_kwargs, + visualize_kwargs, + postprocess_kwargs, + ) = self._dispatch_kwargs(**kwargs) + + ori_inputs = self._inputs_to_list(inputs) + inputs = self.preprocess( + ori_inputs, batch_size=batch_size, **preprocess_kwargs) + preds = [] + + results_dict = {'predictions': [], 'visualization': []} + for data in (track(inputs, description='Inference') + if self.show_progress else inputs): + preds.extend(self.forward(data, **forward_kwargs)) + visualization = self.visualize(ori_inputs, preds, + **visualize_kwargs) + results = self.postprocess(preds, visualization, + return_datasamples, + **postprocess_kwargs) + results_dict['predictions'].extend(results['predictions']) + if results['visualization'] is not None: + results_dict['visualization'].extend(results['visualization']) + return results_dict def postprocess( self, @@ -238,7 +268,8 @@ def postprocess( visualization: Optional[List[np.ndarray]] = None, return_datasample: bool = False, print_result: bool = False, - pred_out_file: str = '', + no_save_pred: bool = False, + pred_out_dir: str = '', ) -> Union[ResType, Tuple[ResType, np.ndarray]]: """Process the predictions and visualization results from ``forward`` and ``visualize``. @@ -258,7 +289,7 @@ def postprocess( Defaults to False. print_result (bool): Whether to print the inference result w/o visualization to the console. Defaults to False. - pred_out_file (str): File to save the inference results w/o + pred_out_dir (str): Dir to save the inference results w/o visualization. If left as empty, no file will be saved. Defaults to ''. @@ -273,35 +304,56 @@ def postprocess( json-serializable dict containing only basic data elements such as strings and numbers. """ + if no_save_pred is True: + pred_out_dir = '' + result_dict = {} results = preds if not return_datasample: results = [] for pred in preds: - result = self.pred2dict(pred) + result = self.pred2dict(pred, pred_out_dir) results.append(result) + elif pred_out_dir != '': + print_log( + 'Currently does not support saving datasample ' + 'when return_datasample is set to True. ' + 'Prediction results are not saved!', + level=logging.WARNING) + # Add img to the results after printing and dumping result_dict['predictions'] = results if print_result: print(result_dict) - if pred_out_file != '': - mmengine.dump(result_dict, pred_out_file) result_dict['visualization'] = visualization return result_dict - def pred2dict(self, data_sample: InstanceData) -> Dict: + # TODO: The data format and fields saved in json need further discussion. + # Maybe should include model name, timestamp, filename, image info etc. + def pred2dict(self, + data_sample: Det3DDataSample, + pred_out_dir: str = '') -> Dict: """Extract elements necessary to represent a prediction into a dictionary. It's better to contain only basic data elements such as strings and numbers in order to guarantee it's json-serializable. + + Args: + data_sample (:obj:`DetDataSample`): Predictions of the model. + pred_out_dir: Dir to save the inference results w/o + visualization. If left as empty, no file will be saved. + Defaults to ''. + + Returns: + dict: Prediction results. 
""" result = {} if 'pred_instances_3d' in data_sample: pred_instances_3d = data_sample.pred_instances_3d.numpy() result = { - 'bboxes_3d': pred_instances_3d.bboxes_3d.tensor.cpu().tolist(), 'labels_3d': pred_instances_3d.labels_3d.tolist(), - 'scores_3d': pred_instances_3d.scores_3d.tolist() + 'scores_3d': pred_instances_3d.scores_3d.tolist(), + 'bboxes_3d': pred_instances_3d.bboxes_3d.tensor.cpu().tolist() } if 'pred_pts_seg' in data_sample: @@ -309,4 +361,23 @@ def pred2dict(self, data_sample: InstanceData) -> Dict: result['pts_semantic_mask'] = \ pred_pts_seg.pts_semantic_mask.tolist() + if data_sample.box_mode_3d == Box3DMode.LIDAR: + result['box_type_3d'] = 'LiDAR' + elif data_sample.box_mode_3d == Box3DMode.CAM: + result['box_type_3d'] = 'Camera' + elif data_sample.box_mode_3d == Box3DMode.DEPTH: + result['box_type_3d'] = 'Depth' + + if pred_out_dir != '': + if 'lidar_path' in data_sample: + lidar_path = osp.basename(data_sample.lidar_path) + lidar_path = osp.splitext(lidar_path)[0] + out_json_path = osp.join(pred_out_dir, 'preds', + lidar_path + '.json') + else: + out_json_path = osp.join( + pred_out_dir, 'preds', + f'{str(self.num_visualized_imgs).zfill(8)}.json') + dump(result, out_json_path) + return result diff --git a/mmdet3d/apis/inferencers/lidar_det3d_inferencer.py b/mmdet3d/apis/inferencers/lidar_det3d_inferencer.py index a3fdc479b7..e75a8e553e 100644 --- a/mmdet3d/apis/inferencers/lidar_det3d_inferencer.py +++ b/mmdet3d/apis/inferencers/lidar_det3d_inferencer.py @@ -4,11 +4,14 @@ import mmengine import numpy as np +import torch from mmengine.dataset import Compose from mmengine.infer.infer import ModelType from mmengine.structures import InstanceData from mmdet3d.registry import INFERENCERS +from mmdet3d.structures import (CameraInstance3DBoxes, DepthInstance3DBoxes, + Det3DDataSample, LiDARInstance3DBoxes) from mmdet3d.utils import ConfigType from .base_3d_inferencer import Base3DInferencer @@ -43,16 +46,6 @@ class LidarDet3DInferencer(Base3DInferencer): priority is palette -> config -> checkpoint. Defaults to 'none'. """ - preprocess_kwargs: set = set() - forward_kwargs: set = set() - visualize_kwargs: set = { - 'return_vis', 'show', 'wait_time', 'draw_pred', 'pred_score_thr', - 'img_out_dir' - } - postprocess_kwargs: set = { - 'print_result', 'pred_out_file', 'return_datasample' - } - def __init__(self, model: Union[ModelType, str, None] = None, weights: Optional[str] = None, @@ -113,9 +106,10 @@ def visualize(self, preds: PredType, return_vis: bool = False, show: bool = False, - wait_time: int = 0, + wait_time: int = -1, draw_pred: bool = True, pred_score_thr: float = 0.3, + no_save_vis: bool = False, img_out_dir: str = '') -> Union[List[np.ndarray], None]: """Visualize predictions. @@ -131,6 +125,8 @@ def visualize(self, Defaults to True. pred_score_thr (float): Minimum score of bboxes to draw. Defaults to 0.3. + no_save_vis (bool): Whether to force not to save prediction + vis results. Defaults to False. img_out_dir (str): Output directory of visualization results. If left as empty, no file will be saved. Defaults to ''. @@ -138,8 +134,10 @@ def visualize(self, List[np.ndarray] or None: Returns visualization results only if applicable. 
""" - if self.visualizer is None or (not show and img_out_dir == '' - and not return_vis): + if no_save_vis is True: + img_out_dir = '' + + if not show and img_out_dir == '' and not return_vis: return None if getattr(self, 'visualizer') is None: @@ -156,17 +154,20 @@ def visualize(self, points = points.reshape(-1, self.load_dim) points = points[:, self.use_dim] pc_name = osp.basename(single_input).split('.bin')[0] - pc_name = f'{pc_name}.png' + pc_name = f'vis_lidar/{pc_name}.png' elif isinstance(single_input, np.ndarray): points = single_input.copy() pc_num = str(self.num_visualized_frames).zfill(8) - pc_name = f'pc_{pc_num}.png' + pc_name = f'vis_lidar/{pc_num}.png' else: raise ValueError('Unsupported input type: ' f'{type(single_input)}') - o3d_save_path = osp.join(img_out_dir, pc_name) \ - if img_out_dir != '' else None + if img_out_dir != '' and show: + o3d_save_path = osp.join(img_out_dir, pc_name) + mmengine.mkdir_or_exist(osp.dirname(o3d_save_path)) + else: + o3d_save_path = None data_input = dict(points=points) self.visualizer.add_datasample( @@ -185,3 +186,53 @@ def visualize(self, self.num_visualized_frames += 1 return results + + def visualize_preds_fromfile(self, inputs: InputsType, preds: PredType, + **kwargs) -> Union[List[np.ndarray], None]: + """Visualize predictions from `*.json` files. + + Args: + inputs (InputsType): Inputs for the inferencer. + preds (PredType): Predictions of the model. + return_vis (bool): Whether to return the visualization result. + Defaults to False. + show (bool): Whether to display the image in a popup window. + Defaults to False. + wait_time (float): The interval of show (s). Defaults to 0. + draw_pred (bool): Whether to draw predicted bounding boxes. + Defaults to True. + pred_score_thr (float): Minimum score of bboxes to draw. + Defaults to 0.3. + no_save_vis (bool): Whether to force not to save prediction + vis results. Defaults to False. + img_out_dir (str): Output directory of visualization results. + If left as empty, no file will be saved. Defaults to ''. + + Returns: + List[np.ndarray] or None: Returns visualization results only if + applicable. + """ + data_samples = [] + for pred in preds: + pred = mmengine.load(pred) + data_sample = Det3DDataSample() + data_sample.pred_instances_3d = InstanceData() + + data_sample.pred_instances_3d.labels_3d = torch.tensor( + pred['labels_3d']) + data_sample.pred_instances_3d.scores_3d = torch.tensor( + pred['scores_3d']) + if pred['box_type_3d'] == 'LiDAR': + data_sample.pred_instances_3d.bboxes_3d = \ + LiDARInstance3DBoxes(pred['bboxes_3d']) + elif pred['box_type_3d'] == 'Camera': + data_sample.pred_instances_3d.bboxes_3d = \ + CameraInstance3DBoxes(pred['bboxes_3d']) + elif pred['box_type_3d'] == 'Depth': + data_sample.pred_instances_3d.bboxes_3d = \ + DepthInstance3DBoxes(pred['bboxes_3d']) + else: + raise ValueError('Unsupported box type: ' + f'{pred["box_type_3d"]}') + data_samples.append(data_sample) + return self.visualize(inputs=inputs, preds=data_samples, **kwargs) diff --git a/mmdet3d/visualization/local_visualizer.py b/mmdet3d/visualization/local_visualizer.py index a557b3d426..b99636c084 100644 --- a/mmdet3d/visualization/local_visualizer.py +++ b/mmdet3d/visualization/local_visualizer.py @@ -1,6 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import copy +import logging import math +import os import sys import time from typing import List, Optional, Sequence, Tuple, Union @@ -176,8 +178,10 @@ def _initialize_o3d_vis(self) -> Visualizer: o3d_vis.register_key_action_callback(glfw_key_space, self.space_action_callback) o3d_vis.register_key_callback(glfw_key_right, self.right_callback) - o3d_vis.create_window() - self.view_control = o3d_vis.get_view_control() + if os.environ.get('DISPLAY', None) is not None: + o3d_vis.create_window() + print_log('No display device found', level=logging.WARNING) + self.view_control = o3d_vis.get_view_control() return o3d_vis @master_only From 5c51f1a10d42c9b82a80d7b79957ad9a881dd505 Mon Sep 17 00:00:00 2001 From: sunjiahao1999 <578431509@qq.com> Date: Mon, 9 Oct 2023 20:10:21 +0800 Subject: [PATCH 02/11] fix all demo --- demo/inference_demo.ipynb | 133 +++++++----------- demo/new_mono_det_demo.py | 98 +++++++++++++ demo/new_multi_modality_demo.py | 101 +++++++++++++ demo/new_pcd_demo.py | 8 +- demo/new_pcd_seg_demo.py | 85 +++++++++++ mmdet3d/apis/inference.py | 3 +- .../apis/inferencers/base_3d_inferencer.py | 55 +++++++- .../inferencers/lidar_det3d_inferencer.py | 10 +- .../inferencers/lidar_seg3d_inferencer.py | 30 ++-- .../apis/inferencers/mono_det3d_inferencer.py | 28 ++-- .../multi_modality_det3d_inferencer.py | 38 +++-- mmdet3d/datasets/transforms/loading.py | 53 ++----- mmdet3d/visualization/local_visualizer.py | 17 ++- 13 files changed, 453 insertions(+), 206 deletions(-) create mode 100644 demo/new_mono_det_demo.py create mode 100644 demo/new_multi_modality_demo.py create mode 100644 demo/new_pcd_seg_demo.py diff --git a/demo/inference_demo.ipynb b/demo/inference_demo.ipynb index 0989863ddd..14854722d6 100644 --- a/demo/inference_demo.ipynb +++ b/demo/inference_demo.ipynb @@ -2,117 +2,83 @@ "cells": [ { "cell_type": "code", - "execution_count": null, - "source": [ - "from mmdet3d.apis import inference_detector, init_model\n", - "from mmdet3d.registry import VISUALIZERS\n", - "from mmdet3d.utils import register_all_modules" - ], - "outputs": [], + "execution_count": 25, "metadata": { "pycharm": { "is_executing": false } - } - }, - { - "cell_type": "code", - "execution_count": null, - "source": [ - "# register all modules in mmdet3d into the registries\n", - "register_all_modules()" - ], + }, "outputs": [], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 8, "source": [ - "config_file = '../configs/second/hv_second_secfpn_6x8_80e_kitti-3d-car.py'\n", - "# download the checkpoint from model zoo and put it in `checkpoints/`\n", - "checkpoint_file = '../work_dirs/second/epoch_40.pth'" - ], - "outputs": [], - "metadata": { - "pycharm": { - "is_executing": false - } - } + "from mmdet3d.apis import LidarDet3DInferencer" + ] }, { "cell_type": "code", "execution_count": null, - "source": [ - "# build the model from a config file and a checkpoint file\n", - "model = init_model(config_file, checkpoint_file, device='cuda:0')" - ], + "metadata": {}, "outputs": [], - "metadata": {} + "source": [ + "# initialize inferencer\n", + "inferencer = LidarDet3DInferencer('pointpillars_kitti-3class')" + ] }, { "cell_type": "code", "execution_count": null, - "source": [ - "# init visualizer\n", - "visualizer = VISUALIZERS.build(model.cfg.visualizer)\n", - "visualizer.dataset_meta = {\n", - " 'CLASSES': model.CLASSES,\n", - " 'PALETTE': model.PALETTE\n", - "}" - ], - "outputs": [], "metadata": { "pycharm": { "is_executing": false } - } + }, + "outputs": [], + "source": [ + "# inference\n", + 
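    "# the input dict maps the modality name ('points') to a local file path\n",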
"inputs = dict(points='./data/kitti/000008.bin')\n", + "inferencer(inputs)" + ] }, { "cell_type": "code", - "execution_count": 11, - "source": [ - "# test a single sample\n", - "pcd = './data/kitti/000008.bin'\n", - "result, data = inference_detector(model, pcd)\n", - "points = data['inputs']['points']\n", - "data_input = dict(points=points)" - ], + "execution_count": null, + "metadata": {}, "outputs": [], - "metadata": { - "pycharm": { - "is_executing": false - } - } + "source": [ + "# inference and visualize\n", + "# NOTE: use the `Esc` key to exit Open3D window in Jupyter Notebook Environment\n", + "inferencer(inputs, show=True)" + ] }, { "cell_type": "code", "execution_count": null, - "source": [ - "# show the results\n", - "out_dir = './'\n", - "visualizer.add_datasample(\n", - " 'result',\n", - " data_input,\n", - " data_sample=result,\n", - " draw_gt=False,\n", - " show=True,\n", - " wait_time=0,\n", - " out_file=out_dir,\n", - " vis_task='det')" - ], + "metadata": {}, "outputs": [], - "metadata": { - "pycharm": { - "is_executing": false - } - } + "source": [ + "# If your operating environment does not have a display device,\n", + "# (e.g. a remote server), you can save the predictions and visualize\n", + "# them in local devices.\n", + "inferencer(inputs, show=False, out_dir='./remote_outputs')\n", + "\n", + "# Simulate the migration process\n", + "%mv ./remote_outputs ./local_outputs\n", + "\n", + "# Visualize the predictions from the saved files\n", + "# NOTE: use the `Esc` key to exit Open3D window in Jupyter Notebook Environment\n", + "local_inferencer = LidarDet3DInferencer('pointpillars_kitti-3class')\n", + "inputs = local_inferencer._inputs_to_list(inputs)\n", + "local_inferencer.visualize_preds_fromfile(inputs, ['local_outputs/preds/000008.json'], show=True)" + ] } ], "metadata": { + "interpreter": { + "hash": "a0c343fece975dd89087e8c2194dd4d3db28d7000f1b32ed9ed9d584dd54dbbe" + }, "kernelspec": { - "name": "python3", - "display_name": "Python 3.7.6 64-bit ('torch1.7-cu10.1': conda)" + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -124,19 +90,16 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.6" + "version": "3.9.16" }, "pycharm": { "stem_cell": { "cell_type": "raw", - "source": [], "metadata": { "collapsed": false - } + }, + "source": [] } - }, - "interpreter": { - "hash": "a0c343fece975dd89087e8c2194dd4d3db28d7000f1b32ed9ed9d584dd54dbbe" } }, "nbformat": 4, diff --git a/demo/new_mono_det_demo.py b/demo/new_mono_det_demo.py new file mode 100644 index 0000000000..08ba916a8f --- /dev/null +++ b/demo/new_mono_det_demo.py @@ -0,0 +1,98 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import logging +import os +from argparse import ArgumentParser + +from mmengine.logging import print_log + +from mmdet3d.apis import MonoDet3DInferencer + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument('img', help='Image file') + parser.add_argument('infos', help='Infos file with annotations') + parser.add_argument('model', help='Config file') + parser.add_argument('weights', help='Checkpoint file') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--cam-type', + type=str, + default='CAM_BACK', + help='choose camera type to inference') + parser.add_argument( + '--pred-score-thr', + type=float, + default=0.3, + help='bbox score threshold') + parser.add_argument( + '--out-dir', + type=str, + default='outputs', + help='Output directory of prediction results.') + parser.add_argument( + '--show', + action='store_true', + help='Show online visualization results') + parser.add_argument( + '--wait_time', + type=float, + default=-1, + help='The interval of show (s). Demo will be blocked in showing' + 'results, if wait_time is -1. Defaults to -1.') + parser.add_argument( + '--no-save-vis', + action='store_true', + help='Do not save detection vis results') + parser.add_argument( + '--no-save-pred', + action='store_true', + help='Do not save detection json results') + parser.add_argument( + '--print-result', + action='store_true', + help='Whether to print the results.') + call_args = vars(parser.parse_args()) + + call_args['inputs'] = dict( + img=call_args.pop('img'), infos=call_args.pop('infos')) + call_args.pop('cam_type') + + if call_args['no_save_vis'] and call_args['no_save_pred']: + call_args['out_dir'] = '' + + init_kws = ['model', 'weights', 'device'] + init_args = {} + for init_kw in init_kws: + init_args[init_kw] = call_args.pop(init_kw) + + # NOTE: If your operating environment does not have a display device, + # (e.g. a remote server), you can save the predictions and visualize + # them in local devices. + if os.environ.get('DISPLAY') is None and call_args['show']: + print_log( + 'Display device not found. `--show` is forced to False', + logger='current', + level=logging.WARNING) + call_args['show'] = False + + return init_args, call_args + + +def main(): + # TODO: Support inference of point cloud numpy file. + init_args, call_args = parse_args() + + inferencer = MonoDet3DInferencer(**init_args) + inferencer(**call_args) + + if call_args['out_dir'] != '' and not (call_args['no_save_vis'] + and call_args['no_save_pred']): + print_log( + f'results have been saved at {call_args["out_dir"]}', + logger='current') + + +if __name__ == '__main__': + main() diff --git a/demo/new_multi_modality_demo.py b/demo/new_multi_modality_demo.py new file mode 100644 index 0000000000..3b4605aeeb --- /dev/null +++ b/demo/new_multi_modality_demo.py @@ -0,0 +1,101 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
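+# Example usage (all paths are placeholders):
+#   python demo/new_multi_modality_demo.py ${PCD} ${IMG} ${INFOS} ${CONFIG} ${CHECKPOINT} --show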
+import logging +import os +from argparse import ArgumentParser + +from mmengine.logging import print_log + +from mmdet3d.apis import MultiModalityDet3DInferencer + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument('pcd', help='Point cloud file') + parser.add_argument('img', help='Image file') + parser.add_argument('infos', help='Infos file with annotations') + parser.add_argument('model', help='Config file') + parser.add_argument('weights', help='Checkpoint file') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--cam-type', + type=str, + default='CAM_BACK', + help='choose camera type to inference') + parser.add_argument( + '--pred-score-thr', + type=float, + default=0.3, + help='bbox score threshold') + parser.add_argument( + '--out-dir', + type=str, + default='outputs', + help='Output directory of prediction results.') + parser.add_argument( + '--show', + action='store_true', + help='Show online visualization results') + parser.add_argument( + '--wait_time', + type=float, + default=-1, + help='The interval of show (s). Demo will be blocked in showing' + 'results, if wait_time is -1. Defaults to -1.') + parser.add_argument( + '--no-save-vis', + action='store_true', + help='Do not save detection vis results') + parser.add_argument( + '--no-save-pred', + action='store_true', + help='Do not save detection json results') + parser.add_argument( + '--print-result', + action='store_true', + help='Whether to print the results.') + call_args = vars(parser.parse_args()) + + call_args['inputs'] = dict( + points=call_args.pop('pcd'), + img=call_args.pop('img'), + infos=call_args.pop('infos')) + call_args.pop('cam_type') + + if call_args['no_save_vis'] and call_args['no_save_pred']: + call_args['out_dir'] = '' + + init_kws = ['model', 'weights', 'device'] + init_args = {} + for init_kw in init_kws: + init_args[init_kw] = call_args.pop(init_kw) + + # NOTE: If your operating environment does not have a display device, + # (e.g. a remote server), you can save the predictions and visualize + # them in local devices. + if os.environ.get('DISPLAY') is None and call_args['show']: + print_log( + 'Display device not found. `--show` is forced to False', + logger='current', + level=logging.WARNING) + call_args['show'] = False + + return init_args, call_args + + +def main(): + # TODO: Support inference of point cloud numpy file. 
+ init_args, call_args = parse_args() + + inferencer = MultiModalityDet3DInferencer(**init_args) + inferencer(**call_args) + + if call_args['out_dir'] != '' and not (call_args['no_save_vis'] + and call_args['no_save_pred']): + print_log( + f'results have been saved at {call_args["out_dir"]}', + logger='current') + + +if __name__ == '__main__': + main() diff --git a/demo/new_pcd_demo.py b/demo/new_pcd_demo.py index 9dd80979c7..d7d850bc12 100644 --- a/demo/new_pcd_demo.py +++ b/demo/new_pcd_demo.py @@ -77,13 +77,7 @@ def main(): init_args, call_args = parse_args() inferencer = LidarDet3DInferencer(**init_args) - # inferencer(**call_args) - - inputs = call_args.pop('inputs') - _, _, visualize_kwargs, _ = inferencer._dispatch_kwargs(**call_args) - inputs = inferencer._inputs_to_list(inputs) - inferencer.visualize_preds_fromfile(inputs, ['outputs/preds/000008.json'], - **visualize_kwargs) + inferencer(**call_args) if call_args['out_dir'] != '' and not (call_args['no_save_vis'] and call_args['no_save_pred']): diff --git a/demo/new_pcd_seg_demo.py b/demo/new_pcd_seg_demo.py new file mode 100644 index 0000000000..bc6697e165 --- /dev/null +++ b/demo/new_pcd_seg_demo.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import logging +import os +from argparse import ArgumentParser + +from mmengine.logging import print_log + +from mmdet3d.apis import LidarSeg3DInferencer + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument('pcd', help='Point cloud file') + parser.add_argument('model', help='Config file') + parser.add_argument('weights', help='Checkpoint file') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--out-dir', + type=str, + default='outputs', + help='Output directory of prediction results.') + parser.add_argument( + '--show', + action='store_true', + help='Show online visualization results') + parser.add_argument( + '--wait_time', + type=float, + default=-1, + help='The interval of show (s). Demo will be blocked in showing' + 'results, if wait_time is -1. Defaults to -1.') + parser.add_argument( + '--no-save-vis', + action='store_true', + help='Do not save detection vis results') + parser.add_argument( + '--no-save-pred', + action='store_true', + help='Do not save detection json results') + parser.add_argument( + '--print-result', + action='store_true', + help='Whether to print the results.') + call_args = vars(parser.parse_args()) + + call_args['inputs'] = dict(points=call_args.pop('pcd')) + + if call_args['no_save_vis'] and call_args['no_save_pred']: + call_args['out_dir'] = '' + + init_kws = ['model', 'weights', 'device'] + init_args = {} + for init_kw in init_kws: + init_args[init_kw] = call_args.pop(init_kw) + + # NOTE: If your operating environment does not have a display device, + # (e.g. a remote server), you can save the predictions and visualize + # them in local devices. + if os.environ.get('DISPLAY') is None and call_args['show']: + print_log( + 'Display device not found. `--show` is forced to False', + logger='current', + level=logging.WARNING) + call_args['show'] = False + + return init_args, call_args + + +def main(): + # TODO: Support inference of point cloud numpy file. 
+ init_args, call_args = parse_args() + + inferencer = LidarSeg3DInferencer(**init_args) + inferencer(**call_args) + + if call_args['out_dir'] != '' and not (call_args['no_save_vis'] + and call_args['no_save_pred']): + print_log( + f'results have been saved at {call_args["out_dir"]}', + logger='current') + + +if __name__ == '__main__': + main() diff --git a/mmdet3d/apis/inference.py b/mmdet3d/apis/inference.py index e3fa9d5e3a..e355b958ee 100644 --- a/mmdet3d/apis/inference.py +++ b/mmdet3d/apis/inference.py @@ -392,7 +392,8 @@ def inference_segmentor(model: nn.Module, pcds: PointsType): new_test_pipeline = [] for pipeline in test_pipeline: - if pipeline['type'] != 'LoadAnnotations3D': + if pipeline['type'] != 'LoadAnnotations3D' and pipeline[ + 'type'] != 'PointSegClassMapping': new_test_pipeline.append(pipeline) test_pipeline = Compose(new_test_pipeline) diff --git a/mmdet3d/apis/inferencers/base_3d_inferencer.py b/mmdet3d/apis/inferencers/base_3d_inferencer.py index 26004512a3..8996bb0bab 100644 --- a/mmdet3d/apis/inferencers/base_3d_inferencer.py +++ b/mmdet3d/apis/inferencers/base_3d_inferencer.py @@ -4,6 +4,7 @@ from copy import deepcopy from typing import Dict, List, Optional, Sequence, Tuple, Union +import mmengine import numpy as np import torch.nn as nn from mmengine import dump, print_log @@ -50,11 +51,11 @@ class Base3DInferencer(BaseInferencer): priority is palette -> config -> checkpoint. Defaults to 'none'. """ - preprocess_kwargs: set = set() + preprocess_kwargs: set = {'cam_type'} forward_kwargs: set = set() visualize_kwargs: set = { 'return_vis', 'show', 'wait_time', 'draw_pred', 'pred_score_thr', - 'img_out_dir', 'no_save_vis' + 'img_out_dir', 'no_save_vis', 'cam_type_dir' } postprocess_kwargs: set = { 'print_result', 'pred_out_dir', 'return_datasample', 'no_save_pred' @@ -131,10 +132,10 @@ def _init_model( model.eval() return model - def _inputs_to_list( - self, - inputs: Union[dict, list], - modality_key: Union[str, List[str]] = 'points') -> list: + def _inputs_to_list(self, + inputs: Union[dict, list], + modality_key: Union[str, List[str]] = 'points', + cam_type='CAM2') -> list: """Preprocess the inputs to a list. 
Preprocess inputs to a list according to its type: @@ -158,6 +159,11 @@ def _inputs_to_list( modality_key = [modality_key] assert set(modality_key).issubset({'points', 'img'}) + if 'infos' in inputs: + infos = inputs.pop('infos') + else: + infos = None + for key in modality_key: if isinstance(inputs, dict) and isinstance(inputs[key], str): img = inputs[key] @@ -174,6 +180,31 @@ def _inputs_to_list( if not isinstance(inputs, (list, tuple)): inputs = [inputs] + # get cam2img, lidar2cam and lidar2img from infos + if infos is not None: + info_list = mmengine.load(infos)['data_list'] + assert len(info_list) == len(inputs) + for index, input in enumerate(inputs): + data_info = info_list[index] + img_path = data_info['images'][cam_type]['img_path'] + if osp.basename(img_path) != osp.basename(input['img']): + raise ValueError( + f'the info file of {img_path} is not provided.') + cam2img = np.asarray( + data_info['images'][cam_type]['cam2img'], dtype=np.float32) + lidar2cam = np.asarray( + data_info['images'][cam_type]['lidar2cam'], + dtype=np.float32) + if 'lidar2img' in data_info['images'][cam_type]: + lidar2img = np.asarray( + data_info['images'][cam_type]['lidar2img'], + dtype=np.float32) + else: + lidar2img = cam2img @ lidar2cam + input['cam2img'] = cam2img + input['lidar2cam'] = lidar2cam + input['lidar2img'] = lidar2img + return list(inputs) def _get_transform_idx(self, pipeline_cfg: ConfigType, name: str) -> int: @@ -193,12 +224,14 @@ def _init_visualizer(self, cfg: ConfigType) -> Optional[Visualizer]: def _dispatch_kwargs(self, out_dir: str = '', + cam_type: str = '', **kwargs) -> Tuple[Dict, Dict, Dict, Dict]: """Dispatch kwargs to preprocess(), forward(), visualize() and postprocess() according to the actual demands. Args: out_dir (str): Dir to save the inference results. + cam_type (str): Camera type. Defaults to 'CAM2'. **kwargs (dict): Key words arguments passed to :meth:`preprocess`, :meth:`forward`, :meth:`visualize` and :meth:`postprocess`. Each key in kwargs should be in the corresponding set of @@ -211,6 +244,8 @@ def _dispatch_kwargs(self, """ kwargs['img_out_dir'] = out_dir kwargs['pred_out_dir'] = out_dir + if cam_type != '': + kwargs['cam_type_dir'] = cam_type return super()._dispatch_kwargs(**kwargs) def __call__(self, @@ -243,7 +278,8 @@ def __call__(self, postprocess_kwargs, ) = self._dispatch_kwargs(**kwargs) - ori_inputs = self._inputs_to_list(inputs) + cam_type = preprocess_kwargs.pop('cam_type', 'CAM2') + ori_inputs = self._inputs_to_list(inputs, cam_type=cam_type) inputs = self.preprocess( ori_inputs, batch_size=batch_size, **preprocess_kwargs) preds = [] @@ -374,6 +410,11 @@ def pred2dict(self, lidar_path = osp.splitext(lidar_path)[0] out_json_path = osp.join(pred_out_dir, 'preds', lidar_path + '.json') + elif 'img_path' in data_sample: + img_path = osp.basename(data_sample.img_path) + img_path = osp.splitext(img_path)[0] + out_json_path = osp.join(pred_out_dir, 'preds', + img_path + '.json') else: out_json_path = osp.join( pred_out_dir, 'preds', diff --git a/mmdet3d/apis/inferencers/lidar_det3d_inferencer.py b/mmdet3d/apis/inferencers/lidar_det3d_inferencer.py index e75a8e553e..128c53fb81 100644 --- a/mmdet3d/apis/inferencers/lidar_det3d_inferencer.py +++ b/mmdet3d/apis/inferencers/lidar_det3d_inferencer.py @@ -62,7 +62,7 @@ def __init__(self, scope=scope, palette=palette) - def _inputs_to_list(self, inputs: Union[dict, list]) -> list: + def _inputs_to_list(self, inputs: Union[dict, list], **kwargs) -> list: """Preprocess the inputs to a list. 
Preprocess inputs to a list according to its type: @@ -80,7 +80,7 @@ def _inputs_to_list(self, inputs: Union[dict, list]) -> list: Returns: list: List of input for the :meth:`preprocess`. """ - return super()._inputs_to_list(inputs, modality_key='points') + return super()._inputs_to_list(inputs, modality_key='points', **kwargs) def _init_pipeline(self, cfg: ConfigType) -> Compose: """Initialize the test pipeline.""" @@ -154,17 +154,17 @@ def visualize(self, points = points.reshape(-1, self.load_dim) points = points[:, self.use_dim] pc_name = osp.basename(single_input).split('.bin')[0] - pc_name = f'vis_lidar/{pc_name}.png' + pc_name = f'{pc_name}.png' elif isinstance(single_input, np.ndarray): points = single_input.copy() pc_num = str(self.num_visualized_frames).zfill(8) - pc_name = f'vis_lidar/{pc_num}.png' + pc_name = f'{pc_num}.png' else: raise ValueError('Unsupported input type: ' f'{type(single_input)}') if img_out_dir != '' and show: - o3d_save_path = osp.join(img_out_dir, pc_name) + o3d_save_path = osp.join(img_out_dir, 'vis_lidar', pc_name) mmengine.mkdir_or_exist(osp.dirname(o3d_save_path)) else: o3d_save_path = None diff --git a/mmdet3d/apis/inferencers/lidar_seg3d_inferencer.py b/mmdet3d/apis/inferencers/lidar_seg3d_inferencer.py index 0286a07113..2adb998e4a 100644 --- a/mmdet3d/apis/inferencers/lidar_seg3d_inferencer.py +++ b/mmdet3d/apis/inferencers/lidar_seg3d_inferencer.py @@ -43,16 +43,6 @@ class LidarSeg3DInferencer(Base3DInferencer): priority is palette -> config -> checkpoint. Defaults to 'none'. """ - preprocess_kwargs: set = set() - forward_kwargs: set = set() - visualize_kwargs: set = { - 'return_vis', 'show', 'wait_time', 'draw_pred', 'pred_score_thr', - 'img_out_dir' - } - postprocess_kwargs: set = { - 'print_result', 'pred_out_file', 'return_datasample' - } - def __init__(self, model: Union[ModelType, str, None] = None, weights: Optional[str] = None, @@ -69,7 +59,7 @@ def __init__(self, scope=scope, palette=palette) - def _inputs_to_list(self, inputs: Union[dict, list]) -> list: + def _inputs_to_list(self, inputs: Union[dict, list], **kwargs) -> list: """Preprocess the inputs to a list. Preprocess inputs to a list according to its type: @@ -87,7 +77,7 @@ def _inputs_to_list(self, inputs: Union[dict, list]) -> list: Returns: list: List of input for the :meth:`preprocess`. """ - return super()._inputs_to_list(inputs, modality_key='points') + return super()._inputs_to_list(inputs, modality_key='points', **kwargs) def _init_pipeline(self, cfg: ConfigType) -> Compose: """Initialize the test pipeline.""" @@ -124,6 +114,7 @@ def visualize(self, wait_time: int = 0, draw_pred: bool = True, pred_score_thr: float = 0.3, + no_save_vis: bool = False, img_out_dir: str = '') -> Union[List[np.ndarray], None]: """Visualize predictions. @@ -146,8 +137,10 @@ def visualize(self, List[np.ndarray] or None: Returns visualization results only if applicable. 
""" - if self.visualizer is None or (not show and img_out_dir == '' - and not return_vis): + if no_save_vis is True: + img_out_dir = '' + + if not show and img_out_dir == '' and not return_vis: return None if getattr(self, 'visualizer') is None: @@ -168,13 +161,16 @@ def visualize(self, elif isinstance(single_input, np.ndarray): points = single_input.copy() pc_num = str(self.num_visualized_frames).zfill(8) - pc_name = f'pc_{pc_num}.png' + pc_name = f'{pc_num}.png' else: raise ValueError('Unsupported input type: ' f'{type(single_input)}') - o3d_save_path = osp.join(img_out_dir, pc_name) \ - if img_out_dir != '' else None + if img_out_dir != '' and show: + o3d_save_path = osp.join(img_out_dir, 'vis_lidar', pc_name) + mmengine.mkdir_or_exist(osp.dirname(o3d_save_path)) + else: + o3d_save_path = None data_input = dict(points=points) self.visualizer.add_datasample( diff --git a/mmdet3d/apis/inferencers/mono_det3d_inferencer.py b/mmdet3d/apis/inferencers/mono_det3d_inferencer.py index a0ebfe57fc..d4e231808b 100644 --- a/mmdet3d/apis/inferencers/mono_det3d_inferencer.py +++ b/mmdet3d/apis/inferencers/mono_det3d_inferencer.py @@ -44,16 +44,6 @@ class MonoDet3DInferencer(Base3DInferencer): priority is palette -> config -> checkpoint. Defaults to 'none'. """ - preprocess_kwargs: set = set() - forward_kwargs: set = set() - visualize_kwargs: set = { - 'return_vis', 'show', 'wait_time', 'draw_pred', 'pred_score_thr', - 'img_out_dir' - } - postprocess_kwargs: set = { - 'print_result', 'pred_out_file', 'return_datasample' - } - def __init__(self, model: Union[ModelType, str, None] = None, weights: Optional[str] = None, @@ -70,7 +60,7 @@ def __init__(self, scope=scope, palette=palette) - def _inputs_to_list(self, inputs: Union[dict, list]) -> list: + def _inputs_to_list(self, inputs: Union[dict, list], **kwargs) -> list: """Preprocess the inputs to a list. Preprocess inputs to a list according to its type: @@ -88,7 +78,7 @@ def _inputs_to_list(self, inputs: Union[dict, list]) -> list: Returns: list: List of input for the :meth:`preprocess`. """ - return super()._inputs_to_list(inputs, modality_key='img') + return super()._inputs_to_list(inputs, modality_key='img', **kwargs) def _init_pipeline(self, cfg: ConfigType) -> Compose: """Initialize the test pipeline.""" @@ -110,7 +100,9 @@ def visualize(self, wait_time: int = 0, draw_pred: bool = True, pred_score_thr: float = 0.3, - img_out_dir: str = '') -> Union[List[np.ndarray], None]: + no_save_vis: bool = False, + img_out_dir: str = '', + cam_type_dir: str = 'CAM2') -> Union[List[np.ndarray], None]: """Visualize predictions. Args: @@ -132,8 +124,10 @@ def visualize(self, List[np.ndarray] or None: Returns visualization results only if applicable. 
""" - if self.visualizer is None or (not show and img_out_dir == '' - and not return_vis): + if no_save_vis is True: + img_out_dir = '' + + if not show and img_out_dir == '' and not return_vis: return None if getattr(self, 'visualizer') is None: @@ -156,8 +150,8 @@ def visualize(self, raise ValueError('Unsupported input type: ' f"{type(single_input['img'])}") - out_file = osp.join(img_out_dir, img_name) if img_out_dir != '' \ - else None + out_file = osp.join(img_out_dir, 'vis_camera', cam_type_dir, + img_name) if img_out_dir != '' else None data_input = dict(img=img) self.visualizer.add_datasample( diff --git a/mmdet3d/apis/inferencers/multi_modality_det3d_inferencer.py b/mmdet3d/apis/inferencers/multi_modality_det3d_inferencer.py index ab02e064b8..7f00dd03ff 100644 --- a/mmdet3d/apis/inferencers/multi_modality_det3d_inferencer.py +++ b/mmdet3d/apis/inferencers/multi_modality_det3d_inferencer.py @@ -44,16 +44,6 @@ class MultiModalityDet3DInferencer(Base3DInferencer): palette (str): The palette of visualization. Defaults to 'none'. """ - preprocess_kwargs: set = set() - forward_kwargs: set = set() - visualize_kwargs: set = { - 'return_vis', 'show', 'wait_time', 'draw_pred', 'pred_score_thr', - 'img_out_dir' - } - postprocess_kwargs: set = { - 'print_result', 'pred_out_file', 'return_datasample' - } - def __init__(self, model: Union[ModelType, str, None] = None, weights: Optional[str] = None, @@ -70,7 +60,7 @@ def __init__(self, scope=scope, palette=palette) - def _inputs_to_list(self, inputs: Union[dict, list]) -> list: + def _inputs_to_list(self, inputs: Union[dict, list], **kwargs) -> list: """Preprocess the inputs to a list. Preprocess inputs to a list according to its type: @@ -88,7 +78,8 @@ def _inputs_to_list(self, inputs: Union[dict, list]) -> list: Returns: list: List of input for the :meth:`preprocess`. """ - return super()._inputs_to_list(inputs, modality_key=['points', 'img']) + return super()._inputs_to_list( + inputs, modality_key=['points', 'img'], **kwargs) def _init_pipeline(self, cfg: ConfigType) -> Compose: """Initialize the test pipeline.""" @@ -144,7 +135,9 @@ def visualize(self, wait_time: int = 0, draw_pred: bool = True, pred_score_thr: float = 0.3, - img_out_dir: str = '') -> Union[List[np.ndarray], None]: + no_save_vis: bool = False, + img_out_dir: str = '', + cam_type_dir: str = 'CAM2') -> Union[List[np.ndarray], None]: """Visualize predictions. Args: @@ -166,8 +159,10 @@ def visualize(self, List[np.ndarray] or None: Returns visualization results only if applicable. 
""" - if self.visualizer is None or (not show and img_out_dir == '' - and not return_vis): + if no_save_vis is True: + img_out_dir = '' + + if not show and img_out_dir == '' and not return_vis: return None if getattr(self, 'visualizer') is None: @@ -188,13 +183,16 @@ def visualize(self, elif isinstance(points_input, np.ndarray): points = points_input.copy() pc_num = str(self.num_visualized_frames).zfill(8) - pc_name = f'pc_{pc_num}.png' + pc_name = f'{pc_num}.png' else: raise ValueError('Unsupported input type: ' f'{type(points_input)}') - o3d_save_path = osp.join(img_out_dir, pc_name) \ - if img_out_dir != '' else None + if img_out_dir != '' and show: + o3d_save_path = osp.join(img_out_dir, 'vis_lidar', pc_name) + mmengine.mkdir_or_exist(osp.dirname(o3d_save_path)) + else: + o3d_save_path = None img_input = single_input['img'] if isinstance(single_input['img'], str): @@ -210,8 +208,8 @@ def visualize(self, raise ValueError('Unsupported input type: ' f'{type(img_input)}') - out_file = osp.join(img_out_dir, img_name) if img_out_dir != '' \ - else None + out_file = osp.join(img_out_dir, 'vis_camera', cam_type_dir, + img_name) if img_out_dir != '' else None data_input = dict(points=points, img=img) self.visualizer.add_datasample( diff --git a/mmdet3d/datasets/transforms/loading.py b/mmdet3d/datasets/transforms/loading.py index 1dbad93001..383c44536f 100644 --- a/mmdet3d/datasets/transforms/loading.py +++ b/mmdet3d/datasets/transforms/loading.py @@ -1153,7 +1153,6 @@ class MonoDet3DInferencerLoader(BaseTransform): Added keys: - img - - cam2img - box_type_3d - box_mode_3d @@ -1176,32 +1175,19 @@ def transform(self, single_input: dict) -> dict: dict: The dict contains loaded image and meta information. """ box_type_3d, box_mode_3d = get_box_type('camera') - assert 'calib' in single_input and 'img' in single_input, \ - "key 'calib' and 'img' must be in input dict" - if isinstance(single_input['calib'], str): - calib_path = single_input['calib'] - with open(calib_path, 'r') as f: - lines = f.readlines() - cam2img = np.array([ - float(info) for info in lines[0].split(' ')[0:16] - ]).reshape([4, 4]) - elif isinstance(single_input['calib'], np.ndarray): - cam2img = single_input['calib'] - else: - raise ValueError('Unsupported input calib type: ' - f"{type(single_input['calib'])}") if isinstance(single_input['img'], str): inputs = dict( images=dict( CAM_FRONT=dict( - img_path=single_input['img'], cam2img=cam2img)), + img_path=single_input['img'], + cam2img=single_input['cam2img'])), box_mode_3d=box_mode_3d, box_type_3d=box_type_3d) elif isinstance(single_input['img'], np.ndarray): inputs = dict( img=single_input['img'], - cam2img=cam2img, + cam2img=single_input['cam2img'], box_type_3d=box_type_3d, box_mode_3d=box_mode_3d) else: @@ -1252,9 +1238,9 @@ def transform(self, single_input: dict) -> dict: dict: The dict contains loaded image, point cloud and meta information. 
""" - assert 'points' in single_input and 'img' in single_input and \ - 'calib' in single_input, "key 'points', 'img' and 'calib' must be " - f'in input dict, but got {single_input}' + assert 'points' in single_input and 'img' in single_input, \ + "key 'points', 'img' and must be in input dict," \ + f'but got {single_input}' if isinstance(single_input['points'], str): inputs = dict( lidar_points=dict(lidar_path=single_input['points']), @@ -1283,36 +1269,21 @@ def transform(self, single_input: dict) -> dict: multi_modality_inputs = points_inputs box_type_3d, box_mode_3d = get_box_type('lidar') - if isinstance(single_input['calib'], str): - calib = mmengine.load(single_input['calib']) - - elif isinstance(single_input['calib'], dict): - calib = single_input['calib'] - else: - raise ValueError('Unsupported input calib type: ' - f"{type(single_input['calib'])}") - - cam2img = np.asarray(calib['cam2img'], dtype=np.float32) - lidar2cam = np.asarray(calib['lidar2cam'], dtype=np.float32) - if 'lidar2cam' in calib: - lidar2img = np.asarray(calib['lidar2img'], dtype=np.float32) - else: - lidar2img = cam2img @ lidar2cam if isinstance(single_input['img'], str): inputs = dict( img_path=single_input['img'], - cam2img=cam2img, - lidar2img=lidar2img, - lidar2cam=lidar2cam, + cam2img=single_input['cam2img'], + lidar2img=single_input['lidar2img'], + lidar2cam=single_input['lidar2cam'], box_mode_3d=box_mode_3d, box_type_3d=box_type_3d) elif isinstance(single_input['img'], np.ndarray): inputs = dict( img=single_input['img'], - cam2img=cam2img, - lidar2img=lidar2img, - lidar2cam=lidar2cam, + cam2img=single_input['cam2img'], + lidar2img=single_input['lidar2img'], + lidar2cam=single_input['lidar2cam'], box_type_3d=box_type_3d, box_mode_3d=box_mode_3d) else: diff --git a/mmdet3d/visualization/local_visualizer.py b/mmdet3d/visualization/local_visualizer.py index b99636c084..b8bb2cdd71 100644 --- a/mmdet3d/visualization/local_visualizer.py +++ b/mmdet3d/visualization/local_visualizer.py @@ -1,6 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. import copy -import logging import math import os import sys @@ -157,7 +156,7 @@ def _clear_o3d_vis(self) -> None: if hasattr(self, 'pcd'): del self.pcd - def _initialize_o3d_vis(self) -> Visualizer: + def _initialize_o3d_vis(self, show=True) -> Visualizer: """Initialize open3d vis according to frame_cfg. 
Args: @@ -178,9 +177,8 @@ def _initialize_o3d_vis(self) -> Visualizer: o3d_vis.register_key_action_callback(glfw_key_space, self.space_action_callback) o3d_vis.register_key_callback(glfw_key_right, self.right_callback) - if os.environ.get('DISPLAY', None) is not None: + if os.environ.get('DISPLAY', None) is not None and show: o3d_vis.create_window() - print_log('No display device found', level=logging.WARNING) self.view_control = o3d_vis.get_view_control() return o3d_vis @@ -863,6 +861,9 @@ def show(self, self.view_port) self.flag_exit = not self.o3d_vis.poll_events() self.o3d_vis.update_renderer() + # if not hasattr(self, 'view_control'): + # self.o3d_vis.create_window() + # self.view_control = self.o3d_vis.get_view_control() self.view_port = \ self.view_control.convert_to_pinhole_camera_parameters() # noqa: E501 if wait_time != -1: @@ -980,7 +981,7 @@ def add_datasample(self, # For object detection datasets, no palette is saved palette = self.dataset_meta.get('palette', None) ignore_index = self.dataset_meta.get('ignore_index', None) - if ignore_index is not None and 'gt_pts_seg' in data_sample and vis_task == 'lidar_seg': # noqa: E501 + if vis_task == 'lidar_seg' and ignore_index is not None and 'pts_semantic_mask' in data_sample.gt_pts_seg: # noqa: E501 keep_index = data_sample.gt_pts_seg.pts_semantic_mask != ignore_index # noqa: E501 else: keep_index = None @@ -990,6 +991,9 @@ def add_datasample(self, gt_img_data = None pred_img_data = None + if not hasattr(self, 'o3d_vis'): + self.o3d_vis = self._initialize_o3d_vis(show=show) + if draw_gt and data_sample is not None: if 'gt_instances_3d' in data_sample: gt_data_3d = self._draw_instances_3d( @@ -1087,6 +1091,7 @@ def add_datasample(self, if drawn_img_3d is not None: mmcv.imwrite(drawn_img_3d[..., ::-1], out_file) if drawn_img is not None: - mmcv.imwrite(drawn_img[..., ::-1], out_file) + mmcv.imwrite(drawn_img[..., ::-1], + out_file[:-4] + '_2d' + out_file[-4:]) else: self.add_image(name, drawn_img_3d, step) From 510f50e119d622510fc78cad3126afabf7496aab Mon Sep 17 00:00:00 2001 From: sunjiahao1999 <578431509@qq.com> Date: Wed, 11 Oct 2023 15:21:50 +0800 Subject: [PATCH 03/11] fix bevfusion demo --- projects/BEVFusion/README.md | 2 +- projects/BEVFusion/demo/multi_modalit_demo.py | 78 +++++++++++++++++++ 2 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 projects/BEVFusion/demo/multi_modalit_demo.py diff --git a/projects/BEVFusion/README.md b/projects/BEVFusion/README.md index 0828bd4f8d..9d5ebd4c52 100644 --- a/projects/BEVFusion/README.md +++ b/projects/BEVFusion/README.md @@ -34,7 +34,7 @@ python projects/BEVFusion/setup.py develop Run a demo on NuScenes data using [BEVFusion model](https://drive.google.com/file/d/1QkvbYDk4G2d6SZoeJqish13qSyXA4lp3/view?usp=share_link): ```shell -python demo/multi_modality_demo.py demo/data/nuscenes/n015-2018-07-24-11-22-45+0800__LIDAR_TOP__1532402927647951.pcd.bin demo/data/nuscenes/ demo/data/nuscenes/n015-2018-07-24-11-22-45+0800.pkl projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py ${CHECKPOINT_FILE} --cam-type all --score-thr 0.2 --show +python projects/BEVFusion/demo/multi_modality_demo.py demo/data/nuscenes/n015-2018-07-24-11-22-45+0800__LIDAR_TOP__1532402927647951.pcd.bin demo/data/nuscenes/ demo/data/nuscenes/n015-2018-07-24-11-22-45+0800.pkl projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py ${CHECKPOINT_FILE} --cam-type all --score-thr 0.2 --show ``` ### Training commands diff --git 
a/projects/BEVFusion/demo/multi_modalit_demo.py b/projects/BEVFusion/demo/multi_modalit_demo.py new file mode 100644 index 0000000000..9f82867d40 --- /dev/null +++ b/projects/BEVFusion/demo/multi_modalit_demo.py @@ -0,0 +1,78 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from argparse import ArgumentParser + +import mmcv + +from mmdet3d.apis import inference_multi_modality_detector, init_model +from mmdet3d.registry import VISUALIZERS + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument('pcd', help='Point cloud file') + parser.add_argument('img', help='image file') + parser.add_argument('ann', help='ann file') + parser.add_argument('config', help='Config file') + parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--cam-type', + type=str, + default='CAM_FRONT', + help='choose camera type to inference') + parser.add_argument( + '--score-thr', type=float, default=0.0, help='bbox score threshold') + parser.add_argument( + '--out-dir', type=str, default='demo', help='dir to save results') + parser.add_argument( + '--show', + action='store_true', + help='show online visualization results') + parser.add_argument( + '--snapshot', + action='store_true', + help='whether to save online visualization results') + args = parser.parse_args() + return args + + +def main(args): + # build the model from a config file and a checkpoint file + model = init_model(args.config, args.checkpoint, device=args.device) + + # init visualizer + visualizer = VISUALIZERS.build(model.cfg.visualizer) + visualizer.dataset_meta = model.dataset_meta + + # test a single image and point cloud sample + result, data = inference_multi_modality_detector(model, args.pcd, args.img, + args.ann, args.cam_type) + points = data['inputs']['points'] + if isinstance(result.img_path, list): + img = [] + for img_path in result.img_path: + single_img = mmcv.imread(img_path) + single_img = mmcv.imconvert(single_img, 'bgr', 'rgb') + img.append(single_img) + else: + img = mmcv.imread(result.img_path) + img = mmcv.imconvert(img, 'bgr', 'rgb') + data_input = dict(points=points, img=img) + + # show the results + visualizer.add_datasample( + 'result', + data_input, + data_sample=result, + draw_gt=False, + show=args.show, + wait_time=-1, + out_file=args.out_dir, + pred_score_thr=args.score_thr, + vis_task='multi-modality_det') + + +if __name__ == '__main__': + args = parse_args() + main(args) From b72da9b1a6b2698d7bebea1a7d8a635d76987e34 Mon Sep 17 00:00:00 2001 From: sunjiahao1999 <578431509@qq.com> Date: Mon, 16 Oct 2023 11:42:23 +0800 Subject: [PATCH 04/11] rename --- demo/mono_det_demo.py | 106 +++++++++++++++++----------- demo/multi_modality_demo.py | 119 +++++++++++++++++++------------- demo/new_mono_det_demo.py | 98 -------------------------- demo/new_multi_modality_demo.py | 101 --------------------------- demo/new_pcd_demo.py | 90 ------------------------ demo/new_pcd_seg_demo.py | 85 ----------------------- demo/pcd_demo.py | 104 ++++++++++++++++++---------- demo/pcd_seg_demo.py | 102 +++++++++++++++++---------- 8 files changed, 271 insertions(+), 534 deletions(-) delete mode 100644 demo/new_mono_det_demo.py delete mode 100644 demo/new_multi_modality_demo.py delete mode 100644 demo/new_pcd_demo.py delete mode 100644 demo/new_pcd_seg_demo.py diff --git a/demo/mono_det_demo.py b/demo/mono_det_demo.py index 0096a649e5..08ba916a8f 100644 --- a/demo/mono_det_demo.py +++ b/demo/mono_det_demo.py @@ 
-1,18 +1,19 @@ # Copyright (c) OpenMMLab. All rights reserved. +import logging +import os from argparse import ArgumentParser -import mmcv +from mmengine.logging import print_log -from mmdet3d.apis import inference_mono_3d_detector, init_model -from mmdet3d.registry import VISUALIZERS +from mmdet3d.apis import MonoDet3DInferencer def parse_args(): parser = ArgumentParser() - parser.add_argument('img', help='image file') - parser.add_argument('ann', help='ann file') - parser.add_argument('config', help='Config file') - parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument('img', help='Image file') + parser.add_argument('infos', help='Infos file with annotations') + parser.add_argument('model', help='Config file') + parser.add_argument('weights', help='Checkpoint file') parser.add_argument( '--device', default='cuda:0', help='Device used for inference') parser.add_argument( @@ -21,50 +22,77 @@ def parse_args(): default='CAM_BACK', help='choose camera type to inference') parser.add_argument( - '--score-thr', type=float, default=0.30, help='bbox score threshold') + '--pred-score-thr', + type=float, + default=0.3, + help='bbox score threshold') parser.add_argument( - '--out-dir', type=str, default='demo', help='dir to save results') + '--out-dir', + type=str, + default='outputs', + help='Output directory of prediction results.') parser.add_argument( '--show', action='store_true', - help='show online visualization results') + help='Show online visualization results') + parser.add_argument( + '--wait_time', + type=float, + default=-1, + help='The interval of show (s). Demo will be blocked in showing' + 'results, if wait_time is -1. Defaults to -1.') parser.add_argument( - '--snapshot', + '--no-save-vis', action='store_true', - help='whether to save online visualization results') - args = parser.parse_args() - return args + help='Do not save detection vis results') + parser.add_argument( + '--no-save-pred', + action='store_true', + help='Do not save detection json results') + parser.add_argument( + '--print-result', + action='store_true', + help='Whether to print the results.') + call_args = vars(parser.parse_args()) + + call_args['inputs'] = dict( + img=call_args.pop('img'), infos=call_args.pop('infos')) + call_args.pop('cam_type') + + if call_args['no_save_vis'] and call_args['no_save_pred']: + call_args['out_dir'] = '' + + init_kws = ['model', 'weights', 'device'] + init_args = {} + for init_kw in init_kws: + init_args[init_kw] = call_args.pop(init_kw) + # NOTE: If your operating environment does not have a display device, + # (e.g. a remote server), you can save the predictions and visualize + # them in local devices. + if os.environ.get('DISPLAY') is None and call_args['show']: + print_log( + 'Display device not found. `--show` is forced to False', + logger='current', + level=logging.WARNING) + call_args['show'] = False -def main(args): - # build the model from a config file and a checkpoint file - model = init_model(args.config, args.checkpoint, device=args.device) + return init_args, call_args - # init visualizer - visualizer = VISUALIZERS.build(model.cfg.visualizer) - visualizer.dataset_meta = model.dataset_meta - # test a single image - result = inference_mono_3d_detector(model, args.img, args.ann, - args.cam_type) +def main(): + # TODO: Support inference of point cloud numpy file. 
+ init_args, call_args = parse_args() - img = mmcv.imread(args.img) - img = mmcv.imconvert(img, 'bgr', 'rgb') + inferencer = MonoDet3DInferencer(**init_args) + inferencer(**call_args) - data_input = dict(img=img) - # show the results - visualizer.add_datasample( - 'result', - data_input, - data_sample=result, - draw_gt=False, - show=args.show, - wait_time=-1, - out_file=args.out_dir, - pred_score_thr=args.score_thr, - vis_task='mono_det') + if call_args['out_dir'] != '' and not (call_args['no_save_vis'] + and call_args['no_save_pred']): + print_log( + f'results have been saved at {call_args["out_dir"]}', + logger='current') if __name__ == '__main__': - args = parse_args() - main(args) + main() diff --git a/demo/multi_modality_demo.py b/demo/multi_modality_demo.py index 9f82867d40..3b4605aeeb 100644 --- a/demo/multi_modality_demo.py +++ b/demo/multi_modality_demo.py @@ -1,78 +1,101 @@ # Copyright (c) OpenMMLab. All rights reserved. +import logging +import os from argparse import ArgumentParser -import mmcv +from mmengine.logging import print_log -from mmdet3d.apis import inference_multi_modality_detector, init_model -from mmdet3d.registry import VISUALIZERS +from mmdet3d.apis import MultiModalityDet3DInferencer def parse_args(): parser = ArgumentParser() parser.add_argument('pcd', help='Point cloud file') - parser.add_argument('img', help='image file') - parser.add_argument('ann', help='ann file') - parser.add_argument('config', help='Config file') - parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument('img', help='Image file') + parser.add_argument('infos', help='Infos file with annotations') + parser.add_argument('model', help='Config file') + parser.add_argument('weights', help='Checkpoint file') parser.add_argument( '--device', default='cuda:0', help='Device used for inference') parser.add_argument( '--cam-type', type=str, - default='CAM_FRONT', + default='CAM_BACK', help='choose camera type to inference') parser.add_argument( - '--score-thr', type=float, default=0.0, help='bbox score threshold') + '--pred-score-thr', + type=float, + default=0.3, + help='bbox score threshold') parser.add_argument( - '--out-dir', type=str, default='demo', help='dir to save results') + '--out-dir', + type=str, + default='outputs', + help='Output directory of prediction results.') parser.add_argument( '--show', action='store_true', - help='show online visualization results') + help='Show online visualization results') + parser.add_argument( + '--wait_time', + type=float, + default=-1, + help='The interval of show (s). Demo will be blocked in showing' + 'results, if wait_time is -1. 
Defaults to -1.') + parser.add_argument( + '--no-save-vis', + action='store_true', + help='Do not save detection vis results') + parser.add_argument( + '--no-save-pred', + action='store_true', + help='Do not save detection json results') parser.add_argument( - '--snapshot', + '--print-result', action='store_true', - help='whether to save online visualization results') - args = parser.parse_args() - return args + help='Whether to print the results.') + call_args = vars(parser.parse_args()) + + call_args['inputs'] = dict( + points=call_args.pop('pcd'), + img=call_args.pop('img'), + infos=call_args.pop('infos')) + call_args.pop('cam_type') + + if call_args['no_save_vis'] and call_args['no_save_pred']: + call_args['out_dir'] = '' + + init_kws = ['model', 'weights', 'device'] + init_args = {} + for init_kw in init_kws: + init_args[init_kw] = call_args.pop(init_kw) + + # NOTE: If your operating environment does not have a display device, + # (e.g. a remote server), you can save the predictions and visualize + # them in local devices. + if os.environ.get('DISPLAY') is None and call_args['show']: + print_log( + 'Display device not found. `--show` is forced to False', + logger='current', + level=logging.WARNING) + call_args['show'] = False + return init_args, call_args -def main(args): - # build the model from a config file and a checkpoint file - model = init_model(args.config, args.checkpoint, device=args.device) - # init visualizer - visualizer = VISUALIZERS.build(model.cfg.visualizer) - visualizer.dataset_meta = model.dataset_meta +def main(): + # TODO: Support inference of point cloud numpy file. + init_args, call_args = parse_args() - # test a single image and point cloud sample - result, data = inference_multi_modality_detector(model, args.pcd, args.img, - args.ann, args.cam_type) - points = data['inputs']['points'] - if isinstance(result.img_path, list): - img = [] - for img_path in result.img_path: - single_img = mmcv.imread(img_path) - single_img = mmcv.imconvert(single_img, 'bgr', 'rgb') - img.append(single_img) - else: - img = mmcv.imread(result.img_path) - img = mmcv.imconvert(img, 'bgr', 'rgb') - data_input = dict(points=points, img=img) + inferencer = MultiModalityDet3DInferencer(**init_args) + inferencer(**call_args) - # show the results - visualizer.add_datasample( - 'result', - data_input, - data_sample=result, - draw_gt=False, - show=args.show, - wait_time=-1, - out_file=args.out_dir, - pred_score_thr=args.score_thr, - vis_task='multi-modality_det') + if call_args['out_dir'] != '' and not (call_args['no_save_vis'] + and call_args['no_save_pred']): + print_log( + f'results have been saved at {call_args["out_dir"]}', + logger='current') if __name__ == '__main__': - args = parse_args() - main(args) + main() diff --git a/demo/new_mono_det_demo.py b/demo/new_mono_det_demo.py deleted file mode 100644 index 08ba916a8f..0000000000 --- a/demo/new_mono_det_demo.py +++ /dev/null @@ -1,98 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import logging -import os -from argparse import ArgumentParser - -from mmengine.logging import print_log - -from mmdet3d.apis import MonoDet3DInferencer - - -def parse_args(): - parser = ArgumentParser() - parser.add_argument('img', help='Image file') - parser.add_argument('infos', help='Infos file with annotations') - parser.add_argument('model', help='Config file') - parser.add_argument('weights', help='Checkpoint file') - parser.add_argument( - '--device', default='cuda:0', help='Device used for inference') - parser.add_argument( - '--cam-type', - type=str, - default='CAM_BACK', - help='choose camera type to inference') - parser.add_argument( - '--pred-score-thr', - type=float, - default=0.3, - help='bbox score threshold') - parser.add_argument( - '--out-dir', - type=str, - default='outputs', - help='Output directory of prediction results.') - parser.add_argument( - '--show', - action='store_true', - help='Show online visualization results') - parser.add_argument( - '--wait_time', - type=float, - default=-1, - help='The interval of show (s). Demo will be blocked in showing' - 'results, if wait_time is -1. Defaults to -1.') - parser.add_argument( - '--no-save-vis', - action='store_true', - help='Do not save detection vis results') - parser.add_argument( - '--no-save-pred', - action='store_true', - help='Do not save detection json results') - parser.add_argument( - '--print-result', - action='store_true', - help='Whether to print the results.') - call_args = vars(parser.parse_args()) - - call_args['inputs'] = dict( - img=call_args.pop('img'), infos=call_args.pop('infos')) - call_args.pop('cam_type') - - if call_args['no_save_vis'] and call_args['no_save_pred']: - call_args['out_dir'] = '' - - init_kws = ['model', 'weights', 'device'] - init_args = {} - for init_kw in init_kws: - init_args[init_kw] = call_args.pop(init_kw) - - # NOTE: If your operating environment does not have a display device, - # (e.g. a remote server), you can save the predictions and visualize - # them in local devices. - if os.environ.get('DISPLAY') is None and call_args['show']: - print_log( - 'Display device not found. `--show` is forced to False', - logger='current', - level=logging.WARNING) - call_args['show'] = False - - return init_args, call_args - - -def main(): - # TODO: Support inference of point cloud numpy file. - init_args, call_args = parse_args() - - inferencer = MonoDet3DInferencer(**init_args) - inferencer(**call_args) - - if call_args['out_dir'] != '' and not (call_args['no_save_vis'] - and call_args['no_save_pred']): - print_log( - f'results have been saved at {call_args["out_dir"]}', - logger='current') - - -if __name__ == '__main__': - main() diff --git a/demo/new_multi_modality_demo.py b/demo/new_multi_modality_demo.py deleted file mode 100644 index 3b4605aeeb..0000000000 --- a/demo/new_multi_modality_demo.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import logging -import os -from argparse import ArgumentParser - -from mmengine.logging import print_log - -from mmdet3d.apis import MultiModalityDet3DInferencer - - -def parse_args(): - parser = ArgumentParser() - parser.add_argument('pcd', help='Point cloud file') - parser.add_argument('img', help='Image file') - parser.add_argument('infos', help='Infos file with annotations') - parser.add_argument('model', help='Config file') - parser.add_argument('weights', help='Checkpoint file') - parser.add_argument( - '--device', default='cuda:0', help='Device used for inference') - parser.add_argument( - '--cam-type', - type=str, - default='CAM_BACK', - help='choose camera type to inference') - parser.add_argument( - '--pred-score-thr', - type=float, - default=0.3, - help='bbox score threshold') - parser.add_argument( - '--out-dir', - type=str, - default='outputs', - help='Output directory of prediction results.') - parser.add_argument( - '--show', - action='store_true', - help='Show online visualization results') - parser.add_argument( - '--wait_time', - type=float, - default=-1, - help='The interval of show (s). Demo will be blocked in showing' - 'results, if wait_time is -1. Defaults to -1.') - parser.add_argument( - '--no-save-vis', - action='store_true', - help='Do not save detection vis results') - parser.add_argument( - '--no-save-pred', - action='store_true', - help='Do not save detection json results') - parser.add_argument( - '--print-result', - action='store_true', - help='Whether to print the results.') - call_args = vars(parser.parse_args()) - - call_args['inputs'] = dict( - points=call_args.pop('pcd'), - img=call_args.pop('img'), - infos=call_args.pop('infos')) - call_args.pop('cam_type') - - if call_args['no_save_vis'] and call_args['no_save_pred']: - call_args['out_dir'] = '' - - init_kws = ['model', 'weights', 'device'] - init_args = {} - for init_kw in init_kws: - init_args[init_kw] = call_args.pop(init_kw) - - # NOTE: If your operating environment does not have a display device, - # (e.g. a remote server), you can save the predictions and visualize - # them in local devices. - if os.environ.get('DISPLAY') is None and call_args['show']: - print_log( - 'Display device not found. `--show` is forced to False', - logger='current', - level=logging.WARNING) - call_args['show'] = False - - return init_args, call_args - - -def main(): - # TODO: Support inference of point cloud numpy file. - init_args, call_args = parse_args() - - inferencer = MultiModalityDet3DInferencer(**init_args) - inferencer(**call_args) - - if call_args['out_dir'] != '' and not (call_args['no_save_vis'] - and call_args['no_save_pred']): - print_log( - f'results have been saved at {call_args["out_dir"]}', - logger='current') - - -if __name__ == '__main__': - main() diff --git a/demo/new_pcd_demo.py b/demo/new_pcd_demo.py deleted file mode 100644 index d7d850bc12..0000000000 --- a/demo/new_pcd_demo.py +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import logging -import os -from argparse import ArgumentParser - -from mmengine.logging import print_log - -from mmdet3d.apis import LidarDet3DInferencer - - -def parse_args(): - parser = ArgumentParser() - parser.add_argument('pcd', help='Point cloud file') - parser.add_argument('model', help='Config file') - parser.add_argument('weights', help='Checkpoint file') - parser.add_argument( - '--device', default='cuda:0', help='Device used for inference') - parser.add_argument( - '--pred-score-thr', - type=float, - default=0.3, - help='bbox score threshold') - parser.add_argument( - '--out-dir', - type=str, - default='outputs', - help='Output directory of prediction results.') - parser.add_argument( - '--show', - action='store_true', - help='Show online visualization results') - parser.add_argument( - '--wait_time', - type=float, - default=-1, - help='The interval of show (s). Demo will be blocked in showing' - 'results, if wait_time is -1. Defaults to -1.') - parser.add_argument( - '--no-save-vis', - action='store_true', - help='Do not save detection vis results') - parser.add_argument( - '--no-save-pred', - action='store_true', - help='Do not save detection json results') - parser.add_argument( - '--print-result', - action='store_true', - help='Whether to print the results.') - call_args = vars(parser.parse_args()) - - call_args['inputs'] = dict(points=call_args.pop('pcd')) - - if call_args['no_save_vis'] and call_args['no_save_pred']: - call_args['out_dir'] = '' - - init_kws = ['model', 'weights', 'device'] - init_args = {} - for init_kw in init_kws: - init_args[init_kw] = call_args.pop(init_kw) - - # NOTE: If your operating environment does not have a display device, - # (e.g. a remote server), you can save the predictions and visualize - # them in local devices. - if os.environ.get('DISPLAY') is None and call_args['show']: - print_log( - 'Display device not found. `--show` is forced to False', - logger='current', - level=logging.WARNING) - call_args['show'] = False - - return init_args, call_args - - -def main(): - # TODO: Support inference of point cloud numpy file. - init_args, call_args = parse_args() - - inferencer = LidarDet3DInferencer(**init_args) - inferencer(**call_args) - - if call_args['out_dir'] != '' and not (call_args['no_save_vis'] - and call_args['no_save_pred']): - print_log( - f'results have been saved at {call_args["out_dir"]}', - logger='current') - - -if __name__ == '__main__': - main() diff --git a/demo/new_pcd_seg_demo.py b/demo/new_pcd_seg_demo.py deleted file mode 100644 index bc6697e165..0000000000 --- a/demo/new_pcd_seg_demo.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import logging -import os -from argparse import ArgumentParser - -from mmengine.logging import print_log - -from mmdet3d.apis import LidarSeg3DInferencer - - -def parse_args(): - parser = ArgumentParser() - parser.add_argument('pcd', help='Point cloud file') - parser.add_argument('model', help='Config file') - parser.add_argument('weights', help='Checkpoint file') - parser.add_argument( - '--device', default='cuda:0', help='Device used for inference') - parser.add_argument( - '--out-dir', - type=str, - default='outputs', - help='Output directory of prediction results.') - parser.add_argument( - '--show', - action='store_true', - help='Show online visualization results') - parser.add_argument( - '--wait_time', - type=float, - default=-1, - help='The interval of show (s). Demo will be blocked in showing' - 'results, if wait_time is -1. 
Defaults to -1.') - parser.add_argument( - '--no-save-vis', - action='store_true', - help='Do not save detection vis results') - parser.add_argument( - '--no-save-pred', - action='store_true', - help='Do not save detection json results') - parser.add_argument( - '--print-result', - action='store_true', - help='Whether to print the results.') - call_args = vars(parser.parse_args()) - - call_args['inputs'] = dict(points=call_args.pop('pcd')) - - if call_args['no_save_vis'] and call_args['no_save_pred']: - call_args['out_dir'] = '' - - init_kws = ['model', 'weights', 'device'] - init_args = {} - for init_kw in init_kws: - init_args[init_kw] = call_args.pop(init_kw) - - # NOTE: If your operating environment does not have a display device, - # (e.g. a remote server), you can save the predictions and visualize - # them in local devices. - if os.environ.get('DISPLAY') is None and call_args['show']: - print_log( - 'Display device not found. `--show` is forced to False', - logger='current', - level=logging.WARNING) - call_args['show'] = False - - return init_args, call_args - - -def main(): - # TODO: Support inference of point cloud numpy file. - init_args, call_args = parse_args() - - inferencer = LidarSeg3DInferencer(**init_args) - inferencer(**call_args) - - if call_args['out_dir'] != '' and not (call_args['no_save_vis'] - and call_args['no_save_pred']): - print_log( - f'results have been saved at {call_args["out_dir"]}', - logger='current') - - -if __name__ == '__main__': - main() diff --git a/demo/pcd_demo.py b/demo/pcd_demo.py index 6c9266d4e0..d7d850bc12 100644 --- a/demo/pcd_demo.py +++ b/demo/pcd_demo.py @@ -1,60 +1,90 @@ # Copyright (c) OpenMMLab. All rights reserved. +import logging +import os from argparse import ArgumentParser -from mmdet3d.apis import inference_detector, init_model -from mmdet3d.registry import VISUALIZERS +from mmengine.logging import print_log + +from mmdet3d.apis import LidarDet3DInferencer def parse_args(): parser = ArgumentParser() parser.add_argument('pcd', help='Point cloud file') - parser.add_argument('config', help='Config file') - parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument('model', help='Config file') + parser.add_argument('weights', help='Checkpoint file') parser.add_argument( '--device', default='cuda:0', help='Device used for inference') parser.add_argument( - '--score-thr', type=float, default=0.0, help='bbox score threshold') + '--pred-score-thr', + type=float, + default=0.3, + help='bbox score threshold') parser.add_argument( - '--out-dir', type=str, default='demo', help='dir to save results') + '--out-dir', + type=str, + default='outputs', + help='Output directory of prediction results.') parser.add_argument( '--show', action='store_true', - help='show online visualization results') + help='Show online visualization results') + parser.add_argument( + '--wait_time', + type=float, + default=-1, + help='The interval of show (s). Demo will be blocked in showing' + 'results, if wait_time is -1. 
Defaults to -1.') + parser.add_argument( + '--no-save-vis', + action='store_true', + help='Do not save detection vis results') parser.add_argument( - '--snapshot', + '--no-save-pred', action='store_true', - help='whether to save online visualization results') - args = parser.parse_args() - return args + help='Do not save detection json results') + parser.add_argument( + '--print-result', + action='store_true', + help='Whether to print the results.') + call_args = vars(parser.parse_args()) + + call_args['inputs'] = dict(points=call_args.pop('pcd')) + if call_args['no_save_vis'] and call_args['no_save_pred']: + call_args['out_dir'] = '' -def main(args): + init_kws = ['model', 'weights', 'device'] + init_args = {} + for init_kw in init_kws: + init_args[init_kw] = call_args.pop(init_kw) + + # NOTE: If your operating environment does not have a display device, + # (e.g. a remote server), you can save the predictions and visualize + # them in local devices. + if os.environ.get('DISPLAY') is None and call_args['show']: + print_log( + 'Display device not found. `--show` is forced to False', + logger='current', + level=logging.WARNING) + call_args['show'] = False + + return init_args, call_args + + +def main(): # TODO: Support inference of point cloud numpy file. - # build the model from a config file and a checkpoint file - model = init_model(args.config, args.checkpoint, device=args.device) - - # init visualizer - visualizer = VISUALIZERS.build(model.cfg.visualizer) - visualizer.dataset_meta = model.dataset_meta - - # test a single point cloud sample - result, data = inference_detector(model, args.pcd) - points = data['inputs']['points'] - data_input = dict(points=points) - - # show the results - visualizer.add_datasample( - 'result', - data_input, - data_sample=result, - draw_gt=False, - show=args.show, - wait_time=-1, - out_file=args.out_dir, - pred_score_thr=args.score_thr, - vis_task='lidar_det') + init_args, call_args = parse_args() + + inferencer = LidarDet3DInferencer(**init_args) + inferencer(**call_args) + + if call_args['out_dir'] != '' and not (call_args['no_save_vis'] + and call_args['no_save_pred']): + print_log( + f'results have been saved at {call_args["out_dir"]}', + logger='current') if __name__ == '__main__': - args = parse_args() - main(args) + main() diff --git a/demo/pcd_seg_demo.py b/demo/pcd_seg_demo.py index 2f1438604b..bc6697e165 100644 --- a/demo/pcd_seg_demo.py +++ b/demo/pcd_seg_demo.py @@ -1,55 +1,85 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+import logging +import os from argparse import ArgumentParser -from mmdet3d.apis import inference_segmentor, init_model -from mmdet3d.registry import VISUALIZERS +from mmengine.logging import print_log + +from mmdet3d.apis import LidarSeg3DInferencer def parse_args(): parser = ArgumentParser() parser.add_argument('pcd', help='Point cloud file') - parser.add_argument('config', help='Config file') - parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument('model', help='Config file') + parser.add_argument('weights', help='Checkpoint file') parser.add_argument( '--device', default='cuda:0', help='Device used for inference') parser.add_argument( - '--out-dir', type=str, default='demo', help='dir to save results') + '--out-dir', + type=str, + default='outputs', + help='Output directory of prediction results.') parser.add_argument( '--show', action='store_true', - help='show online visualization results') + help='Show online visualization results') + parser.add_argument( + '--wait_time', + type=float, + default=-1, + help='The interval of show (s). Demo will be blocked in showing' + 'results, if wait_time is -1. Defaults to -1.') + parser.add_argument( + '--no-save-vis', + action='store_true', + help='Do not save detection vis results') parser.add_argument( - '--snapshot', + '--no-save-pred', action='store_true', - help='whether to save online visualization results') - args = parser.parse_args() - return args - - -def main(args): - # build the model from a config file and a checkpoint file - model = init_model(args.config, args.checkpoint, device=args.device) - - # init visualizer - visualizer = VISUALIZERS.build(model.cfg.visualizer) - visualizer.dataset_meta = model.dataset_meta - - # test a single point cloud sample - result, data = inference_segmentor(model, args.pcd) - points = data['inputs']['points'] - data_input = dict(points=points) - # show the results - visualizer.add_datasample( - 'result', - data_input, - data_sample=result, - draw_gt=False, - show=args.show, - wait_time=-1, - out_file=args.out_dir, - vis_task='lidar_seg') + help='Do not save detection json results') + parser.add_argument( + '--print-result', + action='store_true', + help='Whether to print the results.') + call_args = vars(parser.parse_args()) + + call_args['inputs'] = dict(points=call_args.pop('pcd')) + + if call_args['no_save_vis'] and call_args['no_save_pred']: + call_args['out_dir'] = '' + + init_kws = ['model', 'weights', 'device'] + init_args = {} + for init_kw in init_kws: + init_args[init_kw] = call_args.pop(init_kw) + + # NOTE: If your operating environment does not have a display device, + # (e.g. a remote server), you can save the predictions and visualize + # them in local devices. + if os.environ.get('DISPLAY') is None and call_args['show']: + print_log( + 'Display device not found. `--show` is forced to False', + logger='current', + level=logging.WARNING) + call_args['show'] = False + + return init_args, call_args + + +def main(): + # TODO: Support inference of point cloud numpy file. 
+ init_args, call_args = parse_args() + + inferencer = LidarSeg3DInferencer(**init_args) + inferencer(**call_args) + + if call_args['out_dir'] != '' and not (call_args['no_save_vis'] + and call_args['no_save_pred']): + print_log( + f'results have been saved at {call_args["out_dir"]}', + logger='current') if __name__ == '__main__': - args = parse_args() - main(args) + main() From 53515b949247dd086d77cafd6309cf6596f10293 Mon Sep 17 00:00:00 2001 From: sunjiahao1999 <578431509@qq.com> Date: Mon, 16 Oct 2023 11:54:00 +0800 Subject: [PATCH 05/11] fix det version --- mmdet3d/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mmdet3d/__init__.py b/mmdet3d/__init__.py index 3fb8ca0e2b..9584e57741 100644 --- a/mmdet3d/__init__.py +++ b/mmdet3d/__init__.py @@ -14,8 +14,8 @@ mmengine_maximum_version = '1.0.0' mmengine_version = digit_version(mmengine.__version__) -mmdet_minimum_version = '3.0.0' -mmdet_maximum_version = '3.2.0' +mmdet_minimum_version = '3.0.0rc5' +mmdet_maximum_version = '3.3.0' mmdet_version = digit_version(mmdet.__version__) assert (mmcv_version >= digit_version(mmcv_minimum_version) From 114059e3000fff0d4151ef97d8a41c36bf524406 Mon Sep 17 00:00:00 2001 From: sunjiahao1999 <578431509@qq.com> Date: Mon, 16 Oct 2023 17:12:22 +0800 Subject: [PATCH 06/11] fix ut and fix input_to_list --- .../apis/inferencers/base_3d_inferencer.py | 78 ---------------- .../inferencers/lidar_det3d_inferencer.py | 19 +++- .../inferencers/lidar_seg3d_inferencer.py | 19 +++- .../apis/inferencers/mono_det3d_inferencer.py | 81 ++++++++++++++++- .../multi_modality_det3d_inferencer.py | 89 ++++++++++++++++++- .../test_lidar_det3d_inferencer.py | 14 ++- .../test_lidar_seg3d_inferencer.py | 14 ++- .../test_mono_det3d_inferencer.py | 50 +++++------ .../test_multi_modality_det3d_inferencer.py | 46 +++++----- 9 files changed, 259 insertions(+), 151 deletions(-) diff --git a/mmdet3d/apis/inferencers/base_3d_inferencer.py b/mmdet3d/apis/inferencers/base_3d_inferencer.py index 8996bb0bab..d8a39fb3fc 100644 --- a/mmdet3d/apis/inferencers/base_3d_inferencer.py +++ b/mmdet3d/apis/inferencers/base_3d_inferencer.py @@ -4,12 +4,9 @@ from copy import deepcopy from typing import Dict, List, Optional, Sequence, Tuple, Union -import mmengine import numpy as np import torch.nn as nn from mmengine import dump, print_log -from mmengine.fileio import (get_file_backend, isdir, join_path, - list_dir_or_file) from mmengine.infer.infer import BaseInferencer, ModelType from mmengine.model.utils import revert_sync_batchnorm from mmengine.registry import init_default_scope @@ -132,81 +129,6 @@ def _init_model( model.eval() return model - def _inputs_to_list(self, - inputs: Union[dict, list], - modality_key: Union[str, List[str]] = 'points', - cam_type='CAM2') -> list: - """Preprocess the inputs to a list. - - Preprocess inputs to a list according to its type: - - - list or tuple: return inputs - - dict: the value of key 'points'/`img` is - - Directory path: return all files in the directory - - other cases: return a list containing the string. The string - could be a path to file, a url or other types of string according - to the task. - - Args: - inputs (Union[dict, list]): Inputs for the inferencer. - modality_key (Union[str, List[str]]): The key of the modality. - Defaults to 'points'. - - Returns: - list: List of input for the :meth:`preprocess`. 
- """ - if isinstance(modality_key, str): - modality_key = [modality_key] - assert set(modality_key).issubset({'points', 'img'}) - - if 'infos' in inputs: - infos = inputs.pop('infos') - else: - infos = None - - for key in modality_key: - if isinstance(inputs, dict) and isinstance(inputs[key], str): - img = inputs[key] - backend = get_file_backend(img) - if hasattr(backend, 'isdir') and isdir(img): - # Backends like HttpsBackend do not implement `isdir`, so - # only those backends that implement `isdir` could accept - # the inputs as a directory - filename_list = list_dir_or_file(img, list_dir=False) - inputs = [{ - f'{key}': join_path(img, filename) - } for filename in filename_list] - - if not isinstance(inputs, (list, tuple)): - inputs = [inputs] - - # get cam2img, lidar2cam and lidar2img from infos - if infos is not None: - info_list = mmengine.load(infos)['data_list'] - assert len(info_list) == len(inputs) - for index, input in enumerate(inputs): - data_info = info_list[index] - img_path = data_info['images'][cam_type]['img_path'] - if osp.basename(img_path) != osp.basename(input['img']): - raise ValueError( - f'the info file of {img_path} is not provided.') - cam2img = np.asarray( - data_info['images'][cam_type]['cam2img'], dtype=np.float32) - lidar2cam = np.asarray( - data_info['images'][cam_type]['lidar2cam'], - dtype=np.float32) - if 'lidar2img' in data_info['images'][cam_type]: - lidar2img = np.asarray( - data_info['images'][cam_type]['lidar2img'], - dtype=np.float32) - else: - lidar2img = cam2img @ lidar2cam - input['cam2img'] = cam2img - input['lidar2cam'] = lidar2cam - input['lidar2img'] = lidar2img - - return list(inputs) - def _get_transform_idx(self, pipeline_cfg: ConfigType, name: str) -> int: """Returns the index of the transform in a pipeline. diff --git a/mmdet3d/apis/inferencers/lidar_det3d_inferencer.py b/mmdet3d/apis/inferencers/lidar_det3d_inferencer.py index 128c53fb81..9869a87a28 100644 --- a/mmdet3d/apis/inferencers/lidar_det3d_inferencer.py +++ b/mmdet3d/apis/inferencers/lidar_det3d_inferencer.py @@ -6,6 +6,8 @@ import numpy as np import torch from mmengine.dataset import Compose +from mmengine.fileio import (get_file_backend, isdir, join_path, + list_dir_or_file) from mmengine.infer.infer import ModelType from mmengine.structures import InstanceData @@ -80,7 +82,22 @@ def _inputs_to_list(self, inputs: Union[dict, list], **kwargs) -> list: Returns: list: List of input for the :meth:`preprocess`. 
""" - return super()._inputs_to_list(inputs, modality_key='points', **kwargs) + if isinstance(inputs, dict) and isinstance(inputs['points'], str): + pcd = inputs['points'] + backend = get_file_backend(pcd) + if hasattr(backend, 'isdir') and isdir(pcd): + # Backends like HttpsBackend do not implement `isdir`, so + # only those backends that implement `isdir` could accept + # the inputs as a directory + filename_list = list_dir_or_file(pcd, list_dir=False) + inputs = [{ + 'points': join_path(pcd, filename) + } for filename in filename_list] + + if not isinstance(inputs, (list, tuple)): + inputs = [inputs] + + return list(inputs) def _init_pipeline(self, cfg: ConfigType) -> Compose: """Initialize the test pipeline.""" diff --git a/mmdet3d/apis/inferencers/lidar_seg3d_inferencer.py b/mmdet3d/apis/inferencers/lidar_seg3d_inferencer.py index 2adb998e4a..328d865e0d 100644 --- a/mmdet3d/apis/inferencers/lidar_seg3d_inferencer.py +++ b/mmdet3d/apis/inferencers/lidar_seg3d_inferencer.py @@ -5,6 +5,8 @@ import mmengine import numpy as np from mmengine.dataset import Compose +from mmengine.fileio import (get_file_backend, isdir, join_path, + list_dir_or_file) from mmengine.infer.infer import ModelType from mmengine.structures import InstanceData @@ -77,7 +79,22 @@ def _inputs_to_list(self, inputs: Union[dict, list], **kwargs) -> list: Returns: list: List of input for the :meth:`preprocess`. """ - return super()._inputs_to_list(inputs, modality_key='points', **kwargs) + if isinstance(inputs, dict) and isinstance(inputs['points'], str): + pcd = inputs['points'] + backend = get_file_backend(pcd) + if hasattr(backend, 'isdir') and isdir(pcd): + # Backends like HttpsBackend do not implement `isdir`, so + # only those backends that implement `isdir` could accept + # the inputs as a directory + filename_list = list_dir_or_file(pcd, list_dir=False) + inputs = [{ + 'points': join_path(pcd, filename) + } for filename in filename_list] + + if not isinstance(inputs, (list, tuple)): + inputs = [inputs] + + return list(inputs) def _init_pipeline(self, cfg: ConfigType) -> Compose: """Initialize the test pipeline.""" diff --git a/mmdet3d/apis/inferencers/mono_det3d_inferencer.py b/mmdet3d/apis/inferencers/mono_det3d_inferencer.py index d4e231808b..9a65883aed 100644 --- a/mmdet3d/apis/inferencers/mono_det3d_inferencer.py +++ b/mmdet3d/apis/inferencers/mono_det3d_inferencer.py @@ -6,6 +6,8 @@ import mmengine import numpy as np from mmengine.dataset import Compose +from mmengine.fileio import (get_file_backend, isdir, join_path, + list_dir_or_file) from mmengine.infer.infer import ModelType from mmengine.structures import InstanceData @@ -60,7 +62,10 @@ def __init__(self, scope=scope, palette=palette) - def _inputs_to_list(self, inputs: Union[dict, list], **kwargs) -> list: + def _inputs_to_list(self, + inputs: Union[dict, list], + cam_type='CAM2', + **kwargs) -> list: """Preprocess the inputs to a list. Preprocess inputs to a list according to its type: @@ -78,7 +83,79 @@ def _inputs_to_list(self, inputs: Union[dict, list], **kwargs) -> list: Returns: list: List of input for the :meth:`preprocess`. 
""" - return super()._inputs_to_list(inputs, modality_key='img', **kwargs) + if isinstance(inputs, dict): + assert 'infos' in inputs + infos = inputs.pop('infos') + + if isinstance(inputs['img'], str): + img = inputs['img'] + backend = get_file_backend(img) + if hasattr(backend, 'isdir') and isdir(img): + # Backends like HttpsBackend do not implement `isdir`, so + # only those backends that implement `isdir` could accept + # the inputs as a directory + filename_list = list_dir_or_file(img, list_dir=False) + inputs = [{ + 'img': join_path(img, filename) + } for filename in filename_list] + + if not isinstance(inputs, (list, tuple)): + inputs = [inputs] + + # get cam2img, lidar2cam and lidar2img from infos + info_list = mmengine.load(infos)['data_list'] + assert len(info_list) == len(inputs) + for index, input in enumerate(inputs): + data_info = info_list[index] + img_path = data_info['images'][cam_type]['img_path'] + if osp.isfile(input['img']) and osp.basename( + img_path) != osp.basename(input['img']): + raise ValueError( + f'the info file of {img_path} is not provided.') + cam2img = np.asarray( + data_info['images'][cam_type]['cam2img'], dtype=np.float32) + lidar2cam = np.asarray( + data_info['images'][cam_type]['lidar2cam'], + dtype=np.float32) + if 'lidar2img' in data_info['images'][cam_type]: + lidar2img = np.asarray( + data_info['images'][cam_type]['lidar2img'], + dtype=np.float32) + else: + lidar2img = cam2img @ lidar2cam + input['cam2img'] = cam2img + input['lidar2cam'] = lidar2cam + input['lidar2img'] = lidar2img + elif isinstance(inputs, (list, tuple)): + # get cam2img, lidar2cam and lidar2img from infos + for input in inputs: + assert 'infos' in input + infos = input.pop('infos') + info_list = mmengine.load(infos)['data_list'] + assert len(info_list) == 1, 'Only support single sample info' \ + 'in `.pkl`, when inputs is a list.' + data_info = info_list[0] + img_path = data_info['images'][cam_type]['img_path'] + if osp.isfile(input['img']) and osp.basename( + img_path) != osp.basename(input['img']): + raise ValueError( + f'the info file of {img_path} is not provided.') + cam2img = np.asarray( + data_info['images'][cam_type]['cam2img'], dtype=np.float32) + lidar2cam = np.asarray( + data_info['images'][cam_type]['lidar2cam'], + dtype=np.float32) + if 'lidar2img' in data_info['images'][cam_type]: + lidar2img = np.asarray( + data_info['images'][cam_type]['lidar2img'], + dtype=np.float32) + else: + lidar2img = cam2img @ lidar2cam + input['cam2img'] = cam2img + input['lidar2cam'] = lidar2cam + input['lidar2img'] = lidar2img + + return list(inputs) def _init_pipeline(self, cfg: ConfigType) -> Compose: """Initialize the test pipeline.""" diff --git a/mmdet3d/apis/inferencers/multi_modality_det3d_inferencer.py b/mmdet3d/apis/inferencers/multi_modality_det3d_inferencer.py index 7f00dd03ff..b934ed9e21 100644 --- a/mmdet3d/apis/inferencers/multi_modality_det3d_inferencer.py +++ b/mmdet3d/apis/inferencers/multi_modality_det3d_inferencer.py @@ -7,6 +7,8 @@ import mmengine import numpy as np from mmengine.dataset import Compose +from mmengine.fileio import (get_file_backend, isdir, join_path, + list_dir_or_file) from mmengine.infer.infer import ModelType from mmengine.structures import InstanceData @@ -60,7 +62,10 @@ def __init__(self, scope=scope, palette=palette) - def _inputs_to_list(self, inputs: Union[dict, list], **kwargs) -> list: + def _inputs_to_list(self, + inputs: Union[dict, list], + cam_type: str = 'CAM2', + **kwargs) -> list: """Preprocess the inputs to a list. 
Preprocess inputs to a list according to its type: @@ -78,8 +83,86 @@ def _inputs_to_list(self, inputs: Union[dict, list], **kwargs) -> list: Returns: list: List of input for the :meth:`preprocess`. """ - return super()._inputs_to_list( - inputs, modality_key=['points', 'img'], **kwargs) + if isinstance(inputs, dict): + assert 'infos' in inputs + infos = inputs.pop('infos') + + if isinstance(inputs['img'], str): + img, pcd = inputs['img'], inputs['points'] + backend = get_file_backend(img) + if hasattr(backend, 'isdir') and isdir(img) and isdir(pcd): + # Backends like HttpsBackend do not implement `isdir`, so + # only those backends that implement `isdir` could accept + # the inputs as a directory + img_filename_list = list_dir_or_file( + img, list_dir=False, suffix=['.png', '.jpg']) + pcd_filename_list = list_dir_or_file( + pcd, list_dir=False, suffix='.bin') + assert len(img_filename_list) == len(pcd_filename_list) + + inputs = [{ + 'img': join_path(img, img_filename), + 'points': join_path(pcd, pcd_filename) + } for pcd_filename, img_filename in zip( + pcd_filename_list, img_filename_list)] + + if not isinstance(inputs, (list, tuple)): + inputs = [inputs] + + # get cam2img, lidar2cam and lidar2img from infos + info_list = mmengine.load(infos)['data_list'] + assert len(info_list) == len(inputs) + for index, input in enumerate(inputs): + data_info = info_list[index] + img_path = data_info['images'][cam_type]['img_path'] + if osp.isfile(input['img']) and osp.basename( + img_path) != osp.basename(input['img']): + raise ValueError( + f'the info file of {img_path} is not provided.') + cam2img = np.asarray( + data_info['images'][cam_type]['cam2img'], dtype=np.float32) + lidar2cam = np.asarray( + data_info['images'][cam_type]['lidar2cam'], + dtype=np.float32) + if 'lidar2img' in data_info['images'][cam_type]: + lidar2img = np.asarray( + data_info['images'][cam_type]['lidar2img'], + dtype=np.float32) + else: + lidar2img = cam2img @ lidar2cam + input['cam2img'] = cam2img + input['lidar2cam'] = lidar2cam + input['lidar2img'] = lidar2img + elif isinstance(inputs, (list, tuple)): + # get cam2img, lidar2cam and lidar2img from infos + for input in inputs: + assert 'infos' in input + infos = input.pop('infos') + info_list = mmengine.load(infos)['data_list'] + assert len(info_list) == 1, 'Only support single sample' \ + 'info in `.pkl`, when input is a list.' 
+ data_info = info_list[0] + img_path = data_info['images'][cam_type]['img_path'] + if osp.isfile(input['img']) and osp.basename( + img_path) != osp.basename(input['img']): + raise ValueError( + f'the info file of {img_path} is not provided.') + cam2img = np.asarray( + data_info['images'][cam_type]['cam2img'], dtype=np.float32) + lidar2cam = np.asarray( + data_info['images'][cam_type]['lidar2cam'], + dtype=np.float32) + if 'lidar2img' in data_info['images'][cam_type]: + lidar2img = np.asarray( + data_info['images'][cam_type]['lidar2img'], + dtype=np.float32) + else: + lidar2img = cam2img @ lidar2cam + input['cam2img'] = cam2img + input['lidar2cam'] = lidar2cam + input['lidar2img'] = lidar2img + + return list(inputs) def _init_pipeline(self, cfg: ConfigType) -> Compose: """Initialize the test pipeline.""" diff --git a/tests/test_apis/test_inferencers/test_lidar_det3d_inferencer.py b/tests/test_apis/test_inferencers/test_lidar_det3d_inferencer.py index 7d76b627ac..f4fc771606 100644 --- a/tests/test_apis/test_inferencers/test_lidar_det3d_inferencer.py +++ b/tests/test_apis/test_inferencers/test_lidar_det3d_inferencer.py @@ -89,7 +89,7 @@ def test_visualize(self): inputs = dict(points='tests/data/kitti/training/velodyne/000000.bin'), # img_out_dir with tempfile.TemporaryDirectory() as tmp_dir: - self.inferencer(inputs, img_out_dir=tmp_dir) + self.inferencer(inputs, out_dir=tmp_dir) # TODO: For LiDAR-based detection, the saved image only exists when # show=True. # self.assertTrue(osp.exists(osp.join(tmp_dir, '000000.png'))) @@ -102,11 +102,9 @@ def test_postprocess(self): res = self.inferencer(inputs, return_datasamples=True) self.assertTrue(is_list_of(res['predictions'], Det3DDataSample)) - # pred_out_file + # pred_out_dir with tempfile.TemporaryDirectory() as tmp_dir: - pred_out_file = osp.join(tmp_dir, 'tmp.json') - res = self.inferencer( - inputs, print_result=True, pred_out_file=pred_out_file) - dumped_res = mmengine.load(pred_out_file) - self.assert_predictions_equal(res['predictions'], - dumped_res['predictions']) + res = self.inferencer(inputs, print_result=True, out_dir=tmp_dir) + dumped_res = mmengine.load( + osp.join(tmp_dir, 'preds', '000000.json')) + self.assertEqual(res['predictions'][0], dumped_res) diff --git a/tests/test_apis/test_inferencers/test_lidar_seg3d_inferencer.py b/tests/test_apis/test_inferencers/test_lidar_seg3d_inferencer.py index f0bd72b163..9bbdb037a1 100644 --- a/tests/test_apis/test_inferencers/test_lidar_seg3d_inferencer.py +++ b/tests/test_apis/test_inferencers/test_lidar_seg3d_inferencer.py @@ -91,7 +91,7 @@ def test_visualizer(self): inputs = dict(points='tests/data/s3dis/points/Area_1_office_2.bin') # img_out_dir with tempfile.TemporaryDirectory() as tmp_dir: - self.inferencer(inputs, img_out_dir=tmp_dir) + self.inferencer(inputs, out_dir=tmp_dir) def test_post_processor(self): if not torch.cuda.is_available(): @@ -101,11 +101,9 @@ def test_post_processor(self): res = self.inferencer(inputs, return_datasamples=True) self.assertTrue(is_list_of(res['predictions'], Det3DDataSample)) - # pred_out_file + # pred_out_dir with tempfile.TemporaryDirectory() as tmp_dir: - pred_out_file = osp.join(tmp_dir, 'tmp.json') - res = self.inferencer( - inputs, print_result=True, pred_out_file=pred_out_file) - dumped_res = mmengine.load(pred_out_file) - self.assert_predictions_equal(res['predictions'], - dumped_res['predictions']) + res = self.inferencer(inputs, print_result=True, out_dir=tmp_dir) + dumped_res = mmengine.load( + osp.join(tmp_dir, 'preds', 
'Area_1_office_2.json')) + self.assertEqual(res['predictions'][0], dumped_res) diff --git a/tests/test_apis/test_inferencers/test_mono_det3d_inferencer.py b/tests/test_apis/test_inferencers/test_mono_det3d_inferencer.py index 71df09f78f..d30f156485 100644 --- a/tests/test_apis/test_inferencers/test_mono_det3d_inferencer.py +++ b/tests/test_apis/test_inferencers/test_mono_det3d_inferencer.py @@ -42,13 +42,13 @@ def assert_predictions_equal(self, preds1, preds2): def test_call(self, model): # single img img_path = 'demo/data/kitti/000008.png' - calib_path = 'demo/data/kitti/000008.txt' + infos_path = 'demo/data/kitti/000008.pkl' inferencer = MonoDet3DInferencer(model) - inputs = dict(img=img_path, calib=calib_path) + inputs = dict(img=img_path, infos=infos_path) res_path = inferencer(inputs, return_vis=True) # ndarray img = mmcv.imread(img_path) - inputs = dict(img=img, calib=calib_path) + inputs = dict(img=img, infos=infos_path) res_ndarray = inferencer(inputs, return_vis=True) self.assert_predictions_equal(res_path['predictions'], res_ndarray['predictions']) @@ -59,16 +59,18 @@ def test_call(self, model): inputs = [ dict( img='demo/data/kitti/000008.png', - calib='demo/data/kitti/000008.txt'), + infos='demo/data/kitti/000008.pkl'), dict( img='demo/data/kitti/000008.png', - calib='demo/data/kitti/000008.txt') + infos='demo/data/kitti/000008.pkl') ] res_path = inferencer(inputs, return_vis=True) # list of ndarray imgs = [mmcv.imread(p['img']) for p in inputs] - inputs[0]['img'] = imgs[0] - inputs[1]['img'] = imgs[1] + inputs = [ + dict(img=imgs[0], infos='demo/data/kitti/000008.pkl'), + dict(img=imgs[1], infos='demo/data/kitti/000008.pkl') + ] res_ndarray = inferencer(inputs, return_vis=True) self.assert_predictions_equal(res_path['predictions'], res_ndarray['predictions']) @@ -77,36 +79,30 @@ def test_call(self, model): @parameterized.expand(['pgd_kitti']) def test_visualize(self, model): - inputs = [ - dict( - img='demo/data/kitti/000008.png', - calib='demo/data/kitti/000008.txt'), - dict( - img='demo/data/kitti/000008.png', - calib='demo/data/kitti/000008.txt') - ] + inputs = dict( + img='demo/data/kitti/000008.png', + infos='demo/data/kitti/000008.pkl') inferencer = MonoDet3DInferencer(model) # img_out_dir with tempfile.TemporaryDirectory() as tmp_dir: - inferencer(inputs, img_out_dir=tmp_dir) - for img_dir in ['000008.png', '000008.png']: - self.assertTrue(osp.exists(osp.join(tmp_dir, img_dir))) + inferencer(inputs, out_dir=tmp_dir) + self.assertTrue( + osp.exists(osp.join(tmp_dir, 'vis_camera/CAM2/000008.png'))) @parameterized.expand(['pgd_kitti']) def test_postprocess(self, model): # return_datasample img_path = 'demo/data/kitti/000008.png' - calib_path = 'demo/data/kitti/000008.txt' - inputs = dict(img=img_path, calib=calib_path) + infos_path = 'demo/data/kitti/000008.pkl' + inputs = dict(img=img_path, infos=infos_path) inferencer = MonoDet3DInferencer(model) res = inferencer(inputs, return_datasamples=True) self.assertTrue(is_list_of(res['predictions'], Det3DDataSample)) - # pred_out_file + # pred_out_dir with tempfile.TemporaryDirectory() as tmp_dir: - pred_out_file = osp.join(tmp_dir, 'tmp.json') - res = inferencer( - inputs, print_result=True, pred_out_file=pred_out_file) - dumped_res = mmengine.load(pred_out_file) - self.assert_predictions_equal(res['predictions'], - dumped_res['predictions']) + inputs = dict(img=img_path, infos=infos_path) + res = inferencer(inputs, print_result=True, out_dir=tmp_dir) + dumped_res = mmengine.load( + osp.join(tmp_dir, 'preds', '000008.json')) + 
self.assertEqual(res['predictions'][0], dumped_res) diff --git a/tests/test_apis/test_inferencers/test_multi_modality_det3d_inferencer.py b/tests/test_apis/test_inferencers/test_multi_modality_det3d_inferencer.py index e0be781ca2..c51f7c764b 100644 --- a/tests/test_apis/test_inferencers/test_multi_modality_det3d_inferencer.py +++ b/tests/test_apis/test_inferencers/test_multi_modality_det3d_inferencer.py @@ -44,11 +44,11 @@ def assert_predictions_equal(self, preds1, preds2): def test_call(self): if not torch.cuda.is_available(): return - calib_path = 'tests/data/kitti/training/calib/000000.pkl' - points_path = 'tests/data/kitti/training/velodyne/000000.bin' - img_path = 'tests/data/kitti/training/image_2/000000.png' + infos_path = 'demo/data/kitti/000008.pkl' + points_path = 'demo/data/kitti/000008.bin' + img_path = 'demo/data/kitti/000008.png' # single img & point cloud - inputs = dict(points=points_path, img=img_path, calib=calib_path) + inputs = dict(points=points_path, img=img_path, infos=infos_path) res_path = self.inferencer(inputs, return_vis=True) # ndarray @@ -57,7 +57,7 @@ def test_call(self): points = points.reshape(-1, 4) points = points[:, :4] img = mmcv.imread(inputs['img']) - inputs = dict(points=points, img=img, calib=calib_path) + inputs = dict(points=points, img=img, infos=infos_path) res_ndarray = self.inferencer(inputs, return_vis=True) self.assert_predictions_equal(res_path['predictions'], res_ndarray['predictions']) @@ -66,8 +66,8 @@ def test_call(self): # multiple imgs & point clouds inputs = [ - dict(points=points_path, img=img_path, calib=calib_path), - dict(points=points_path, img=img_path, calib=calib_path) + dict(points=points_path, img=img_path, infos=infos_path), + dict(points=points_path, img=img_path, infos=infos_path) ] res_path = self.inferencer(inputs, return_vis=True) # list of ndarray @@ -77,7 +77,7 @@ def test_call(self): points = np.frombuffer(pts_bytes, dtype=np.float32) points = points.reshape(-1, 4) img = mmcv.imread(p['img']) - all_inputs.append(dict(points=points, img=img, calib=p['calib'])) + all_inputs.append(dict(points=points, img=img, infos=infos_path)) res_ndarray = self.inferencer(all_inputs, return_vis=True) self.assert_predictions_equal(res_path['predictions'], @@ -89,12 +89,12 @@ def test_visualize(self): if not torch.cuda.is_available(): return inputs = dict( - points='tests/data/kitti/training/velodyne/000000.bin', - img='tests/data/kitti/training/image_2/000000.png', - calib='tests/data/kitti/training/calib/000000.pkl'), + points='demo/data/kitti/000008.bin', + img='demo/data/kitti/000008.png', + infos='demo/data/kitti/000008.pkl'), # img_out_dir with tempfile.TemporaryDirectory() as tmp_dir: - self.inferencer(inputs, img_out_dir=tmp_dir) + self.inferencer(inputs, out_dir=tmp_dir) # TODO: For results of LiDAR-based detection, the saved image only # exists when show=True. 
# self.assertTrue(osp.exists(osp.join(tmp_dir, '000000.png'))) @@ -103,18 +103,18 @@ def test_postprocess(self): if not torch.cuda.is_available(): return # return_datasample - inputs = dict( - points='tests/data/kitti/training/velodyne/000000.bin', - img='tests/data/kitti/training/image_2/000000.png', - calib='tests/data/kitti/training/calib/000000.pkl') + infos_path = 'demo/data/kitti/000008.pkl' + points_path = 'demo/data/kitti/000008.bin' + img_path = 'demo/data/kitti/000008.png' + # single img & point cloud + inputs = dict(points=points_path, img=img_path, infos=infos_path) res = self.inferencer(inputs, return_datasamples=True) self.assertTrue(is_list_of(res['predictions'], Det3DDataSample)) - # pred_out_file + # pred_out_dir with tempfile.TemporaryDirectory() as tmp_dir: - pred_out_file = osp.join(tmp_dir, 'tmp.json') - res = self.inferencer( - inputs, print_result=True, pred_out_file=pred_out_file) - dumped_res = mmengine.load(pred_out_file) - self.assert_predictions_equal(res['predictions'], - dumped_res['predictions']) + inputs = dict(points=points_path, img=img_path, infos=infos_path) + res = self.inferencer(inputs, print_result=True, out_dir=tmp_dir) + dumped_res = mmengine.load( + osp.join(tmp_dir, 'preds', '000008.json')) + self.assertEqual(res['predictions'][0], dumped_res) From 0b42058030bb76d7c64101b6f54543af13ad6a1a Mon Sep 17 00:00:00 2001 From: sunjiahao1999 <578431509@qq.com> Date: Mon, 16 Oct 2023 17:45:49 +0800 Subject: [PATCH 07/11] resolve comments --- demo/mono_det_demo.py | 8 ++++---- demo/multi_modality_demo.py | 10 +++++----- demo/pcd_demo.py | 8 ++++---- demo/pcd_seg_demo.py | 8 ++++---- mmdet3d/apis/inferencers/base_3d_inferencer.py | 2 +- mmdet3d/visualization/local_visualizer.py | 5 ++++- .../{multi_modalit_demo.py => multi_modality_demo.py} | 0 7 files changed, 22 insertions(+), 19 deletions(-) rename projects/BEVFusion/demo/{multi_modalit_demo.py => multi_modality_demo.py} (100%) diff --git a/demo/mono_det_demo.py b/demo/mono_det_demo.py index 08ba916a8f..42416fd9f4 100644 --- a/demo/mono_det_demo.py +++ b/demo/mono_det_demo.py @@ -30,13 +30,13 @@ def parse_args(): '--out-dir', type=str, default='outputs', - help='Output directory of prediction results.') + help='Output directory of prediction and visualization results.') parser.add_argument( '--show', action='store_true', help='Show online visualization results') parser.add_argument( - '--wait_time', + '--wait-time', type=float, default=-1, help='The interval of show (s). 
Demo will be blocked in showing' @@ -44,11 +44,11 @@ def parse_args(): parser.add_argument( '--no-save-vis', action='store_true', - help='Do not save detection vis results') + help='Do not save detection visualization results') parser.add_argument( '--no-save-pred', action='store_true', - help='Do not save detection json results') + help='Do not save detection prediction results') parser.add_argument( '--print-result', action='store_true', diff --git a/demo/multi_modality_demo.py b/demo/multi_modality_demo.py index 3b4605aeeb..e3c28cdc48 100644 --- a/demo/multi_modality_demo.py +++ b/demo/multi_modality_demo.py @@ -20,7 +20,7 @@ def parse_args(): parser.add_argument( '--cam-type', type=str, - default='CAM_BACK', + default='CAM_FRONT', help='choose camera type to inference') parser.add_argument( '--pred-score-thr', @@ -31,13 +31,13 @@ def parse_args(): '--out-dir', type=str, default='outputs', - help='Output directory of prediction results.') + help='Output directory of prediction and visualization results.') parser.add_argument( '--show', action='store_true', help='Show online visualization results') parser.add_argument( - '--wait_time', + '--wait-time', type=float, default=-1, help='The interval of show (s). Demo will be blocked in showing' @@ -45,11 +45,11 @@ def parse_args(): parser.add_argument( '--no-save-vis', action='store_true', - help='Do not save detection vis results') + help='Do not save detection visualization results') parser.add_argument( '--no-save-pred', action='store_true', - help='Do not save detection json results') + help='Do not save detection prediction results') parser.add_argument( '--print-result', action='store_true', diff --git a/demo/pcd_demo.py b/demo/pcd_demo.py index d7d850bc12..8ff1090f35 100644 --- a/demo/pcd_demo.py +++ b/demo/pcd_demo.py @@ -24,13 +24,13 @@ def parse_args(): '--out-dir', type=str, default='outputs', - help='Output directory of prediction results.') + help='Output directory of prediction and visualization results.') parser.add_argument( '--show', action='store_true', help='Show online visualization results') parser.add_argument( - '--wait_time', + '--wait-time', type=float, default=-1, help='The interval of show (s). Demo will be blocked in showing' @@ -38,11 +38,11 @@ def parse_args(): parser.add_argument( '--no-save-vis', action='store_true', - help='Do not save detection vis results') + help='Do not save detection visualization results') parser.add_argument( '--no-save-pred', action='store_true', - help='Do not save detection json results') + help='Do not save detection prediction results') parser.add_argument( '--print-result', action='store_true', diff --git a/demo/pcd_seg_demo.py b/demo/pcd_seg_demo.py index bc6697e165..574c21eb2d 100644 --- a/demo/pcd_seg_demo.py +++ b/demo/pcd_seg_demo.py @@ -19,13 +19,13 @@ def parse_args(): '--out-dir', type=str, default='outputs', - help='Output directory of prediction results.') + help='Output directory of prediction and visualization results.') parser.add_argument( '--show', action='store_true', help='Show online visualization results') parser.add_argument( - '--wait_time', + '--wait-time', type=float, default=-1, help='The interval of show (s). 
Demo will be blocked in showing' @@ -33,11 +33,11 @@ def parse_args(): parser.add_argument( '--no-save-vis', action='store_true', - help='Do not save detection vis results') + help='Do not save detection visualization results') parser.add_argument( '--no-save-pred', action='store_true', - help='Do not save detection json results') + help='Do not save detection prediction results') parser.add_argument( '--print-result', action='store_true', diff --git a/mmdet3d/apis/inferencers/base_3d_inferencer.py b/mmdet3d/apis/inferencers/base_3d_inferencer.py index d8a39fb3fc..f3959ed2d3 100644 --- a/mmdet3d/apis/inferencers/base_3d_inferencer.py +++ b/mmdet3d/apis/inferencers/base_3d_inferencer.py @@ -247,7 +247,7 @@ def postprocess( Defaults to False. print_result (bool): Whether to print the inference result w/o visualization to the console. Defaults to False. - pred_out_dir (str): Dir to save the inference results w/o + pred_out_dir (str): Directory to save the inference results w/o visualization. If left as empty, no file will be saved. Defaults to ''. diff --git a/mmdet3d/visualization/local_visualizer.py b/mmdet3d/visualization/local_visualizer.py index b8bb2cdd71..d1eff4af97 100644 --- a/mmdet3d/visualization/local_visualizer.py +++ b/mmdet3d/visualization/local_visualizer.py @@ -991,7 +991,10 @@ def add_datasample(self, gt_img_data = None pred_img_data = None - if not hasattr(self, 'o3d_vis'): + if not hasattr(self, 'o3d_vis') and vis_task in [ + 'multi-view_det', 'lidar_det', 'lidar_seg', + 'multi-modality_det' + ]: self.o3d_vis = self._initialize_o3d_vis(show=show) if draw_gt and data_sample is not None: diff --git a/projects/BEVFusion/demo/multi_modalit_demo.py b/projects/BEVFusion/demo/multi_modality_demo.py similarity index 100% rename from projects/BEVFusion/demo/multi_modalit_demo.py rename to projects/BEVFusion/demo/multi_modality_demo.py From b3fb73337ec11f188ca60e36dc8fb91dfe93d8f6 Mon Sep 17 00:00:00 2001 From: sunjiahao1999 <578431509@qq.com> Date: Mon, 16 Oct 2023 18:20:16 +0800 Subject: [PATCH 08/11] fix os.isfile when python=3.7 --- mmdet3d/apis/inferencers/mono_det3d_inferencer.py | 8 ++++---- .../apis/inferencers/multi_modality_det3d_inferencer.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/mmdet3d/apis/inferencers/mono_det3d_inferencer.py b/mmdet3d/apis/inferencers/mono_det3d_inferencer.py index 9a65883aed..dc15329ca9 100644 --- a/mmdet3d/apis/inferencers/mono_det3d_inferencer.py +++ b/mmdet3d/apis/inferencers/mono_det3d_inferencer.py @@ -108,8 +108,8 @@ def _inputs_to_list(self, for index, input in enumerate(inputs): data_info = info_list[index] img_path = data_info['images'][cam_type]['img_path'] - if osp.isfile(input['img']) and osp.basename( - img_path) != osp.basename(input['img']): + if isinstance(input['img'], str) and \ + osp.basename(img_path) != osp.basename(input['img']): raise ValueError( f'the info file of {img_path} is not provided.') cam2img = np.asarray( @@ -136,8 +136,8 @@ def _inputs_to_list(self, 'in `.pkl`, when inputs is a list.' 
data_info = info_list[0]
             img_path = data_info['images'][cam_type]['img_path']
-            if osp.isfile(input['img']) and osp.basename(
-                    img_path) != osp.basename(input['img']):
+            if isinstance(input['img'], str) and \
+                    osp.basename(img_path) != osp.basename(input['img']):
                 raise ValueError(
                     f'the info file of {img_path} is not provided.')
             cam2img = np.asarray(
diff --git a/mmdet3d/apis/inferencers/multi_modality_det3d_inferencer.py b/mmdet3d/apis/inferencers/multi_modality_det3d_inferencer.py
index b934ed9e21..35f50c9f14 100644
--- a/mmdet3d/apis/inferencers/multi_modality_det3d_inferencer.py
+++ b/mmdet3d/apis/inferencers/multi_modality_det3d_inferencer.py
@@ -115,8 +115,8 @@ def _inputs_to_list(self,
         for index, input in enumerate(inputs):
             data_info = info_list[index]
             img_path = data_info['images'][cam_type]['img_path']
-            if osp.isfile(input['img']) and osp.basename(
-                    img_path) != osp.basename(input['img']):
+            if isinstance(input['img'], str) and \
+                    osp.basename(img_path) != osp.basename(input['img']):
                 raise ValueError(
                     f'the info file of {img_path} is not provided.')
             cam2img = np.asarray(
@@ -143,8 +143,8 @@ def _inputs_to_list(self,
                 'info in `.pkl`, when input is a list.'
             data_info = info_list[0]
             img_path = data_info['images'][cam_type]['img_path']
-            if osp.isfile(input['img']) and osp.basename(
-                    img_path) != osp.basename(input['img']):
+            if isinstance(input['img'], str) and \
+                    osp.basename(img_path) != osp.basename(input['img']):
                 raise ValueError(
                     f'the info file of {img_path} is not provided.')
             cam2img = np.asarray(

From 0709e04faa5873293700ca7a32f0b02d1d79da9f Mon Sep 17 00:00:00 2001
From: sunjiahao1999 <578431509@qq.com>
Date: Tue, 17 Oct 2023 14:11:52 +0800
Subject: [PATCH 09/11] add note in get_started

---
 docs/en/get_started.md    | 4 ++++
 docs/zh_cn/get_started.md | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/docs/en/get_started.md b/docs/en/get_started.md
index c4f132553a..00437debb7 100644
--- a/docs/en/get_started.md
+++ b/docs/en/get_started.md
@@ -141,6 +141,10 @@ You will see a visualizer interface with point cloud, where bounding boxes are p

 **Note**:

+If you install MMDetection3D on a remote server without a display device, you can leave out the `--show` argument. The demo will still save the prediction results to the `outputs/preds/000008.json` file.
+
+**Note**:
+
 If you want to input a `.ply` file, you can use the following function and convert it to `.bin` format. Then you can use the converted `.bin` file to run the demo. Note that you need to install `pandas` and `plyfile` before using this script. This function can also be used for data preprocessing for training `ply data`.
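**Note**: The conversion function referenced above sits outside this hunk, so it is not shown here. A minimal sketch of such a `.ply`-to-`.bin` converter is given below; it assumes `pandas` and `plyfile` are installed, and the name `convert_ply` and the flat float32 output layout are assumptions of the sketch, not the docs' verbatim function.

```python
# Illustrative sketch only -- not the exact function from the docs.
# Assumes `pandas` and `plyfile` are installed; `convert_ply` and the
# float32 output layout are assumptions of this sketch.
import numpy as np
import pandas as pd
from plyfile import PlyData


def convert_ply(input_path: str, output_path: str) -> None:
    plydata = PlyData.read(input_path)  # parse the .ply file
    data = plydata.elements[0].data  # structured array of vertex records
    data_pd = pd.DataFrame(data)  # one column per vertex property
    data_np = np.zeros(data_pd.shape, dtype=np.float64)  # output buffer
    property_names = data[0].dtype.names  # e.g. ('x', 'y', 'z', 'intensity')
    for i, name in enumerate(property_names):  # copy property by property
        data_np[:, i] = data_pd[name]
    # the demos read point clouds as a flat float32 binary blob
    data_np.astype(np.float32).tofile(output_path)


# usage: convert_ply('./test.ply', './test.bin')
```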
diff --git a/docs/zh_cn/get_started.md b/docs/zh_cn/get_started.md
index 9f0c1cb511..224a44c4ab 100644
--- a/docs/zh_cn/get_started.md
+++ b/docs/zh_cn/get_started.md
@@ -139,6 +139,10 @@ python demo/pcd_demo.py demo/data/kitti/000008.bin pointpillars_hv_secfpn_8xb6-1

 **注意**:

+如果你在没有显示设备的服务器上安装 MMDetection3D,你可以省略 `--show` 参数。Demo 仍会将预测结果保存到 `outputs/preds/000008.json` 文件中。
+
+**注意**:
+
 如果您想输入一个 `.ply` 文件,您可以使用如下函数将它转换成 `.bin` 格式。然后您可以使用转化的 `.bin` 文件来运行样例。请注意在使用此脚本之前,您需要安装 `pandas` 和 `plyfile`。这个函数也可以用于训练 `ply 数据`时作为数据预处理来使用。

 ```python

From 450676623e80d7db37429f17d27da8324bc0b84c Mon Sep 17 00:00:00 2001
From: sunjiahao1999 <578431509@qq.com>
Date: Wed, 18 Oct 2023 17:02:43 +0800
Subject: [PATCH 10/11] fix docstring

---
 mmdet3d/apis/inferencers/base_3d_inferencer.py    |  6 +++---
 .../apis/inferencers/lidar_det3d_inferencer.py    | 15 +--------------
 .../apis/inferencers/lidar_seg3d_inferencer.py    |  1 +
 mmdet3d/apis/inferencers/mono_det3d_inferencer.py |  2 ++
 .../multi_modality_det3d_inferencer.py            |  1 +
 5 files changed, 8 insertions(+), 17 deletions(-)

diff --git a/mmdet3d/apis/inferencers/base_3d_inferencer.py b/mmdet3d/apis/inferencers/base_3d_inferencer.py
index f3959ed2d3..6564325e8c 100644
--- a/mmdet3d/apis/inferencers/base_3d_inferencer.py
+++ b/mmdet3d/apis/inferencers/base_3d_inferencer.py
@@ -153,7 +153,7 @@ def _dispatch_kwargs(self,

         Args:
             out_dir (str): Dir to save the inference results.
-            cam_type (str): Camera type. Defaults to 'CAM2'.
+            cam_type (str): Camera type. Defaults to ''.
             **kwargs (dict): Key words arguments passed to :meth:`preprocess`,
                 :meth:`forward`, :meth:`visualize` and :meth:`postprocess`.
                 Each key in kwargs should be in the corresponding set of
@@ -172,16 +172,16 @@ def _dispatch_kwargs(self,
     def __call__(self,
                  inputs: InputsType,
-                 batch_size=1,
+                 batch_size: int = 1,
                  return_datasamples: bool = False,
                  **kwargs) -> Optional[dict]:
         """Call the inferencer.

         Args:
             inputs (InputsType): Inputs for the inferencer.
+            batch_size (int): Batch size. Defaults to 1.
             return_datasamples (bool): Whether to return results as
                 :obj:`BaseDataElement`. Defaults to False.
-            batch_size (int): Batch size. Defaults to 1.
             **kwargs: Key words arguments passed to :meth:`preprocess`,
                 :meth:`forward`, :meth:`visualize` and :meth:`postprocess`.
                 Each key in kwargs should be in the corresponding set of
diff --git a/mmdet3d/apis/inferencers/lidar_det3d_inferencer.py b/mmdet3d/apis/inferencers/lidar_det3d_inferencer.py
index 9869a87a28..fc513e85d8 100644
--- a/mmdet3d/apis/inferencers/lidar_det3d_inferencer.py
+++ b/mmdet3d/apis/inferencers/lidar_det3d_inferencer.py
@@ -137,7 +137,7 @@ def visualize(self,
                 Defaults to False.
             show (bool): Whether to display the image in a popup window.
                 Defaults to False.
-            wait_time (float): The interval of show (s). Defaults to 0.
+            wait_time (float): The interval of show (s). Defaults to -1.
             draw_pred (bool): Whether to draw predicted bounding boxes.
                 Defaults to True.
             pred_score_thr (float): Minimum score of bboxes to draw.
@@ -211,19 +211,6 @@ def visualize_preds_fromfile(self, inputs: InputsType, preds: PredType,
         Args:
             inputs (InputsType): Inputs for the inferencer.
             preds (PredType): Predictions of the model.
-            return_vis (bool): Whether to return the visualization result.
-                Defaults to False.
-            show (bool): Whether to display the image in a popup window.
-                Defaults to False.
-            wait_time (float): The interval of show (s). Defaults to 0.
-            draw_pred (bool): Whether to draw predicted bounding boxes.
-                Defaults to True.
-            pred_score_thr (float): Minimum score of bboxes to draw.
-                Defaults to 0.3.
-            no_save_vis (bool): Whether to force not to save prediction
-                vis results. Defaults to False.
-            img_out_dir (str): Output directory of visualization results.
-                If left as empty, no file will be saved. Defaults to ''.

         Returns:
             List[np.ndarray] or None: Returns visualization results only if
diff --git a/mmdet3d/apis/inferencers/lidar_seg3d_inferencer.py b/mmdet3d/apis/inferencers/lidar_seg3d_inferencer.py
index 328d865e0d..33ab95c104 100644
--- a/mmdet3d/apis/inferencers/lidar_seg3d_inferencer.py
+++ b/mmdet3d/apis/inferencers/lidar_seg3d_inferencer.py
@@ -147,6 +147,7 @@ def visualize(self,
                 Defaults to True.
             pred_score_thr (float): Minimum score of bboxes to draw.
                 Defaults to 0.3.
+            no_save_vis (bool): Whether to skip saving visualization results.
             img_out_dir (str): Output directory of visualization results.
                 If left as empty, no file will be saved. Defaults to ''.

diff --git a/mmdet3d/apis/inferencers/mono_det3d_inferencer.py b/mmdet3d/apis/inferencers/mono_det3d_inferencer.py
index dc15329ca9..22863ae166 100644
--- a/mmdet3d/apis/inferencers/mono_det3d_inferencer.py
+++ b/mmdet3d/apis/inferencers/mono_det3d_inferencer.py
@@ -194,8 +194,10 @@ def visualize(self,
                 Defaults to True.
             pred_score_thr (float): Minimum score of bboxes to draw.
                 Defaults to 0.3.
+            no_save_vis (bool): Whether to skip saving visualization results.
             img_out_dir (str): Output directory of visualization results.
                 If left as empty, no file will be saved. Defaults to ''.
+            cam_type_dir (str): Camera type directory. Defaults to 'CAM2'.

         Returns:
             List[np.ndarray] or None: Returns visualization results only if
diff --git a/mmdet3d/apis/inferencers/multi_modality_det3d_inferencer.py b/mmdet3d/apis/inferencers/multi_modality_det3d_inferencer.py
index 35f50c9f14..6717bb18c8 100644
--- a/mmdet3d/apis/inferencers/multi_modality_det3d_inferencer.py
+++ b/mmdet3d/apis/inferencers/multi_modality_det3d_inferencer.py
@@ -233,6 +233,7 @@ def visualize(self,
             wait_time (float): The interval of show (s). Defaults to 0.
             draw_pred (bool): Whether to draw predicted bounding boxes.
                 Defaults to True.
+            no_save_vis (bool): Whether to skip saving visualization results.
             pred_score_thr (float): Minimum score of bboxes to draw.
                 Defaults to 0.3.
             img_out_dir (str): Output directory of visualization results.

From 93449e31c3e73d814dfd1ca2bb0bf01d13e215d9 Mon Sep 17 00:00:00 2001
From: sunjiahao1999 <578431509@qq.com>
Date: Wed, 18 Oct 2023 17:40:21 +0800
Subject: [PATCH 11/11] fix mmcv version

---
 mmdet3d/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mmdet3d/__init__.py b/mmdet3d/__init__.py
index 9584e57741..5f7c341783 100644
--- a/mmdet3d/__init__.py
+++ b/mmdet3d/__init__.py
@@ -7,7 +7,7 @@ from .version import __version__, version_info

 mmcv_minimum_version = '2.0.0rc4'
-mmcv_maximum_version = '2.1.0'
+mmcv_maximum_version = '2.2.0'
 mmcv_version = digit_version(mmcv.__version__)

 mmengine_minimum_version = '0.8.0'
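**Note**: The constant changed above feeds mmdet3d's import-time compatibility gate. A minimal sketch of that version-window pattern is shown below; `digit_version` is mmengine's real helper for turning version strings into comparable tuples, while the exact assertion message here is illustrative, not the library's verbatim text.

```python
# Minimal sketch of the import-time version gate that the bump above relaxes.
# `digit_version` is mmengine's helper; the assertion message is illustrative.
import mmcv
from mmengine.utils import digit_version

mmcv_minimum_version = '2.0.0rc4'
mmcv_maximum_version = '2.2.0'  # raised from '2.1.0' by this patch
mmcv_version = digit_version(mmcv.__version__)

# lower bound is inclusive, upper bound is exclusive
assert (mmcv_version >= digit_version(mmcv_minimum_version)
        and mmcv_version < digit_version(mmcv_maximum_version)), \
    (f'MMCV=={mmcv.__version__} is used but incompatible: please install '
     f'mmcv>={mmcv_minimum_version}, <{mmcv_maximum_version}.')
```

Because the upper bound is exclusive, the old limit of `'2.1.0'` rejected mmcv 2.1.x releases outright; raising it to `'2.2.0'` is what lets them pass the gate.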