diff --git a/configs/mmocr/text-detection/text-detection_mrcnn_onnxruntime_dynamic.py b/configs/mmocr/text-detection/text-detection_mrcnn_onnxruntime_dynamic.py new file mode 100644 index 0000000000..947d028fd6 --- /dev/null +++ b/configs/mmocr/text-detection/text-detection_mrcnn_onnxruntime_dynamic.py @@ -0,0 +1,33 @@ +_base_ = ['./text-detection_static.py', '../../_base_/backends/onnxruntime.py'] +onnx_config = dict( + output_names=['dets', 'labels', 'masks'], + dynamic_axes=dict( + input=dict({ + 0: 'batch', + 2: 'height', + 3: 'width' + }), + dets=dict({ + 0: 'batch', + 1: 'num_dets' + }), + labels=dict({ + 0: 'batch', + 1: 'num_dets' + }), + masks=dict({ + 0: 'batch', + 1: 'num_dets', + 2: 'height', + 3: 'width' + }))) +codebase_config = dict( + post_processing=dict( + score_threshold=0.05, + confidence_threshold=0.005, + iou_threshold=0.5, + max_output_boxes_per_class=200, + pre_top_k=5000, + keep_top_k=100, + background_label_id=-1, + export_postprocess_mask=False)) diff --git a/configs/mmocr/text-detection/text-detection_mrcnn_tensorrt-fp16_dynamic-320x320-2240x2240.py b/configs/mmocr/text-detection/text-detection_mrcnn_tensorrt-fp16_dynamic-320x320-2240x2240.py new file mode 100644 index 0000000000..2cd6c220d1 --- /dev/null +++ b/configs/mmocr/text-detection/text-detection_mrcnn_tensorrt-fp16_dynamic-320x320-2240x2240.py @@ -0,0 +1,2 @@ +_base_ = ['./text-detection_mrcnn_tensorrt_dynamic-320x320-2240x2240.py'] +backend_config = dict(common_config=dict(fp16_mode=True)) diff --git a/configs/mmocr/text-detection/text-detection_mrcnn_tensorrt-int8_dynamic-320x320-2240x2240.py b/configs/mmocr/text-detection/text-detection_mrcnn_tensorrt-int8_dynamic-320x320-2240x2240.py new file mode 100644 index 0000000000..f08c95f113 --- /dev/null +++ b/configs/mmocr/text-detection/text-detection_mrcnn_tensorrt-int8_dynamic-320x320-2240x2240.py @@ -0,0 +1,5 @@ +_base_ = ['./text-detection_mrcnn_tensorrt_dynamic-320x320-2240x2240.py'] + +backend_config = dict(common_config=dict(fp16_mode=True, int8_mode=True)) + +calib_config = dict(create_calib=True, calib_file='calib_data.h5') diff --git a/configs/mmocr/text-detection/text-detection_mrcnn_tensorrt_dynamic-320x320-2240x2240.py b/configs/mmocr/text-detection/text-detection_mrcnn_tensorrt_dynamic-320x320-2240x2240.py new file mode 100644 index 0000000000..12a03c8c45 --- /dev/null +++ b/configs/mmocr/text-detection/text-detection_mrcnn_tensorrt_dynamic-320x320-2240x2240.py @@ -0,0 +1,45 @@ +_base_ = ['./text-detection_static.py', '../../_base_/backends/tensorrt.py'] +onnx_config = dict( + output_names=['dets', 'labels', 'masks'], + dynamic_axes=dict( + input=dict({ + 0: 'batch', + 2: 'height', + 3: 'width' + }), + dets=dict({ + 0: 'batch', + 1: 'num_dets' + }), + labels=dict({ + 0: 'batch', + 1: 'num_dets' + }), + masks=dict({ + 0: 'batch', + 1: 'num_dets', + 2: 'height', + 3: 'width' + }))) + +backend_config = dict( + common_config=dict(max_workspace_size=1 << 30), + model_inputs=[ + dict( + input_shapes=dict( + input=dict( + min_shape=[1, 3, 320, 320], + opt_shape=[1, 3, 600, 800], + max_shape=[1, 3, 2240, 2240]))) + ]) + +codebase_config = dict( + post_processing=dict( + score_threshold=0.05, + confidence_threshold=0.005, + iou_threshold=0.5, + max_output_boxes_per_class=200, + pre_top_k=5000, + keep_top_k=100, + background_label_id=-1, + export_postprocess_mask=False)) diff --git a/configs/mmocr/text-detection/text-detection_mrcnn_torchscript.py b/configs/mmocr/text-detection/text-detection_mrcnn_torchscript.py new file mode 100644 index 
0000000000..073560e704 --- /dev/null +++ b/configs/mmocr/text-detection/text-detection_mrcnn_torchscript.py @@ -0,0 +1,18 @@ +_base_ = [ + '../../_base_/torchscript_config.py', + '../../_base_/backends/torchscript.py' +] + +ir_config = dict(input_shape=None, output_names=['dets', 'labels', 'masks']) +codebase_config = dict( + type='mmocr', + task='TextDetection', + post_processing=dict( + score_threshold=0.05, + confidence_threshold=0.005, + iou_threshold=0.5, + max_output_boxes_per_class=200, + pre_top_k=5000, + keep_top_k=100, + background_label_id=-1, + export_postprocess_mask=False)) diff --git a/configs/mmocr/text-recognition/text-recognition_tensorrt-fp16_dynamic-1x32x32-1x32x640.py b/configs/mmocr/text-recognition/text-recognition_tensorrt-fp16_dynamic-1x32x32-1x32x640.py index 68cb0ea73f..f0a6ea4a86 100644 --- a/configs/mmocr/text-recognition/text-recognition_tensorrt-fp16_dynamic-1x32x32-1x32x640.py +++ b/configs/mmocr/text-recognition/text-recognition_tensorrt-fp16_dynamic-1x32x32-1x32x640.py @@ -1,3 +1,4 @@ +# 1 channel input for CRNN models _base_ = [ './text-recognition_dynamic.py', '../../_base_/backends/tensorrt-fp16.py' ] diff --git a/configs/mmocr/text-recognition/text-recognition_tensorrt-fp16_dynamic-32x32-32x640.py b/configs/mmocr/text-recognition/text-recognition_tensorrt-fp16_dynamic-32x32-32x640.py index 87a144391e..fe85452ace 100644 --- a/configs/mmocr/text-recognition/text-recognition_tensorrt-fp16_dynamic-32x32-32x640.py +++ b/configs/mmocr/text-recognition/text-recognition_tensorrt-fp16_dynamic-32x32-32x640.py @@ -1,3 +1,4 @@ +# 3 channel and 32 height input for SATRN models _base_ = [ './text-recognition_dynamic.py', '../../_base_/backends/tensorrt-fp16.py' ] diff --git a/configs/mmocr/text-recognition/text-recognition_tensorrt-fp16_dynamic-48x64-48x640.py b/configs/mmocr/text-recognition/text-recognition_tensorrt-fp16_dynamic-48x64-48x640.py new file mode 100644 index 0000000000..fcbabb63bf --- /dev/null +++ b/configs/mmocr/text-recognition/text-recognition_tensorrt-fp16_dynamic-48x64-48x640.py @@ -0,0 +1,14 @@ +# 3 channel and 48 height for SAR models +_base_ = [ + './text-recognition_dynamic.py', '../../_base_/backends/tensorrt-fp16.py' +] +backend_config = dict( + common_config=dict(max_workspace_size=1 << 30), + model_inputs=[ + dict( + input_shapes=dict( + input=dict( + min_shape=[1, 3, 48, 64], + opt_shape=[1, 3, 48, 64], + max_shape=[1, 3, 48, 640]))) + ]) diff --git a/configs/mmocr/text-recognition/text-recognition_tensorrt-fp16_static-1x32x32.py b/configs/mmocr/text-recognition/text-recognition_tensorrt-fp16_static-1x32x32.py index d1621913df..b1acd5069e 100644 --- a/configs/mmocr/text-recognition/text-recognition_tensorrt-fp16_static-1x32x32.py +++ b/configs/mmocr/text-recognition/text-recognition_tensorrt-fp16_static-1x32x32.py @@ -1,3 +1,4 @@ +# 1 channel input for CRNN models _base_ = [ './text-recognition_static.py', '../../_base_/backends/tensorrt-fp16.py' ] diff --git a/configs/mmocr/text-recognition/text-recognition_tensorrt-fp16_static-32x128.py b/configs/mmocr/text-recognition/text-recognition_tensorrt-fp16_static-32x128.py new file mode 100644 index 0000000000..bc5865260c --- /dev/null +++ b/configs/mmocr/text-recognition/text-recognition_tensorrt-fp16_static-32x128.py @@ -0,0 +1,16 @@ +# ABINet models use static input 32x128 +_base_ = [ + './text-recognition_static.py', '../../_base_/backends/tensorrt-fp16.py' +] + +onnx_config = dict(input_shape=[128, 32]) +backend_config = dict( + common_config=dict(max_workspace_size=1 << 30), + 
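# Static 1x3x32x128 profile for ABINet: min, opt and max shapes are identical, + # so the TensorRT engine is built for exactly one input shape. +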
model_inputs=[ + dict( + input_shapes=dict( + input=dict( + min_shape=[1, 3, 32, 128], + opt_shape=[1, 3, 32, 128], + max_shape=[1, 3, 32, 128]))) + ]) diff --git a/configs/mmocr/text-recognition/text-recognition_tensorrt-int8_dynamic-1x32x32-1x32x640.py b/configs/mmocr/text-recognition/text-recognition_tensorrt-int8_dynamic-1x32x32-1x32x640.py index 49194a862b..c749c4a5cc 100644 --- a/configs/mmocr/text-recognition/text-recognition_tensorrt-int8_dynamic-1x32x32-1x32x640.py +++ b/configs/mmocr/text-recognition/text-recognition_tensorrt-int8_dynamic-1x32x32-1x32x640.py @@ -1,3 +1,4 @@ +# 1 channel input for CRNN models _base_ = [ './text-recognition_dynamic.py', '../../_base_/backends/tensorrt-int8.py' ] diff --git a/configs/mmocr/text-recognition/text-recognition_tensorrt-int8_dynamic-32x32-32x640.py b/configs/mmocr/text-recognition/text-recognition_tensorrt-int8_dynamic-32x32-32x640.py index cebb2674c9..136afc410b 100644 --- a/configs/mmocr/text-recognition/text-recognition_tensorrt-int8_dynamic-32x32-32x640.py +++ b/configs/mmocr/text-recognition/text-recognition_tensorrt-int8_dynamic-32x32-32x640.py @@ -1,3 +1,4 @@ +# 3 channel and 32 height input for SATRN models _base_ = [ './text-recognition_dynamic.py', '../../_base_/backends/tensorrt-int8.py' ] diff --git a/configs/mmocr/text-recognition/text-recognition_tensorrt-int8_dynamic-48x64-48x640.py b/configs/mmocr/text-recognition/text-recognition_tensorrt-int8_dynamic-48x64-48x640.py new file mode 100644 index 0000000000..8289b0ce82 --- /dev/null +++ b/configs/mmocr/text-recognition/text-recognition_tensorrt-int8_dynamic-48x64-48x640.py @@ -0,0 +1,14 @@ +# 3 channel and 48 height for SAR models +_base_ = [ + './text-recognition_dynamic.py', '../../_base_/backends/tensorrt-int8.py' +] +backend_config = dict( + common_config=dict(max_workspace_size=1 << 30), + model_inputs=[ + dict( + input_shapes=dict( + input=dict( + min_shape=[1, 3, 48, 64], + opt_shape=[1, 3, 48, 64], + max_shape=[1, 3, 48, 640]))) + ]) diff --git a/configs/mmocr/text-recognition/text-recognition_tensorrt-int8_static-1x32x32.py b/configs/mmocr/text-recognition/text-recognition_tensorrt-int8_static-1x32x32.py index df36ce93e6..6fbeabc272 100644 --- a/configs/mmocr/text-recognition/text-recognition_tensorrt-int8_static-1x32x32.py +++ b/configs/mmocr/text-recognition/text-recognition_tensorrt-int8_static-1x32x32.py @@ -1,3 +1,4 @@ +# 1 channel input for CRNN models _base_ = [ './text-recognition_static.py', '../../_base_/backends/tensorrt-int8.py' ] diff --git a/configs/mmocr/text-recognition/text-recognition_tensorrt-int8_static-32x128.py b/configs/mmocr/text-recognition/text-recognition_tensorrt-int8_static-32x128.py new file mode 100644 index 0000000000..64e8954c3a --- /dev/null +++ b/configs/mmocr/text-recognition/text-recognition_tensorrt-int8_static-32x128.py @@ -0,0 +1,16 @@ +# ABINet models use static input 32x128 +_base_ = [ + './text-recognition_static.py', '../../_base_/backends/tensorrt-int8.py' +] + +onnx_config = dict(input_shape=[128, 32]) +backend_config = dict( + common_config=dict(max_workspace_size=1 << 30), + model_inputs=[ + dict( + input_shapes=dict( + input=dict( + min_shape=[1, 3, 32, 128], + opt_shape=[1, 3, 32, 128], + max_shape=[1, 3, 32, 128]))) + ]) diff --git a/configs/mmocr/text-recognition/text-recognition_tensorrt_dynamic-1x32x32-1x32x640.py b/configs/mmocr/text-recognition/text-recognition_tensorrt_dynamic-1x32x32-1x32x640.py index 6fca1265a3..795b1566d6 100644 --- 
a/configs/mmocr/text-recognition/text-recognition_tensorrt_dynamic-1x32x32-1x32x640.py +++ b/configs/mmocr/text-recognition/text-recognition_tensorrt_dynamic-1x32x32-1x32x640.py @@ -1,3 +1,4 @@ +# 1 channel input for CRNN models _base_ = ['./text-recognition_dynamic.py', '../../_base_/backends/tensorrt.py'] backend_config = dict( common_config=dict(max_workspace_size=1 << 30), diff --git a/configs/mmocr/text-recognition/text-recognition_tensorrt_dynamic-32x32-32x640.py b/configs/mmocr/text-recognition/text-recognition_tensorrt_dynamic-32x32-32x640.py index 4f26716e13..2f6c98a61b 100644 --- a/configs/mmocr/text-recognition/text-recognition_tensorrt_dynamic-32x32-32x640.py +++ b/configs/mmocr/text-recognition/text-recognition_tensorrt_dynamic-32x32-32x640.py @@ -1,3 +1,4 @@ +# 3 channel and 32 height input for SATRN models _base_ = ['./text-recognition_dynamic.py', '../../_base_/backends/tensorrt.py'] backend_config = dict( common_config=dict(max_workspace_size=1 << 30), diff --git a/configs/mmocr/text-recognition/text-recognition_tensorrt_dynamic-48x64-48x640.py b/configs/mmocr/text-recognition/text-recognition_tensorrt_dynamic-48x64-48x640.py new file mode 100644 index 0000000000..932470d35b --- /dev/null +++ b/configs/mmocr/text-recognition/text-recognition_tensorrt_dynamic-48x64-48x640.py @@ -0,0 +1,12 @@ +# 3 channel and 48 height for SAR models +_base_ = ['./text-recognition_dynamic.py', '../../_base_/backends/tensorrt.py'] +backend_config = dict( + common_config=dict(max_workspace_size=1 << 30), + model_inputs=[ + dict( + input_shapes=dict( + input=dict( + min_shape=[1, 3, 48, 64], + opt_shape=[1, 3, 48, 64], + max_shape=[1, 3, 48, 640]))) + ]) diff --git a/configs/mmocr/text-recognition/text-recognition_tensorrt_static-1x32x32.py b/configs/mmocr/text-recognition/text-recognition_tensorrt_static-1x32x32.py index a7e653c8a6..9a4b122c59 100644 --- a/configs/mmocr/text-recognition/text-recognition_tensorrt_static-1x32x32.py +++ b/configs/mmocr/text-recognition/text-recognition_tensorrt_static-1x32x32.py @@ -1,3 +1,4 @@ +# 1 channel input for CRNN models _base_ = ['./text-recognition_static.py', '../../_base_/backends/tensorrt.py'] onnx_config = dict(input_shape=[32, 32]) diff --git a/configs/mmocr/text-recognition/text-recognition_tensorrt_static-32x128.py b/configs/mmocr/text-recognition/text-recognition_tensorrt_static-32x128.py new file mode 100644 index 0000000000..9b7029ff66 --- /dev/null +++ b/configs/mmocr/text-recognition/text-recognition_tensorrt_static-32x128.py @@ -0,0 +1,14 @@ +# ABINet models use static input 32x128 +_base_ = ['./text-recognition_static.py', '../../_base_/backends/tensorrt.py'] + +onnx_config = dict(input_shape=[128, 32]) +backend_config = dict( + common_config=dict(max_workspace_size=1 << 30), + model_inputs=[ + dict( + input_shapes=dict( + input=dict( + min_shape=[1, 3, 32, 128], + opt_shape=[1, 3, 32, 128], + max_shape=[1, 3, 32, 128]))) + ]) diff --git a/docs/en/03-benchmark/benchmark.md b/docs/en/03-benchmark/benchmark.md index ed7604f72a..18ef2faa3b 100644 --- a/docs/en/03-benchmark/benchmark.md +++ b/docs/en/03-benchmark/benchmark.md @@ -1178,6 +1178,42 @@ Users can directly test the performance through [how_to_evaluate_a_model.md](../ 0.7949 0.7950 + + DBNetpp + TextDetection + ICDAR2015 + recall + 0.8209 + 0.8209 + 0.8209 + 0.8199 + 0.8204 + 0.8204 + - + 0.8209 + + + precision + 0.9079 + 0.9079 + 0.9079 + 0.9117 + 0.9117 + 0.9142 + - + 0.9079 + + + hmean + 0.8622 + 0.8622 + 0.8622 + 0.8634 + 0.8637 + 0.8648 + - + 0.8622 + PSENet TextDetection @@ 
-1250,6 +1286,78 @@ Users can directly test the performance through [how_to_evaluate_a_model.md](../ - 0.7955 + + TextSnake + TextDetection + CTW1500 + recall + 0.8052 + 0.8052 + 0.8052 + 0.8055 + - + - + - + - + + + precision + 0.8535 + 0.8535 + 0.8535 + 0.8538 + - + - + - + - + + + hmean + 0.8286 + 0.8286 + 0.8286 + 0.8290 + - + - + - + - + + + MaskRCNN + TextDetection + ICDAR2015 + recall + 0.7766 + 0.7766 + 0.7766 + 0.7766 + 0.7761 + 0.7670 + - + - + + + precision + 0.8644 + 0.8644 + 0.8644 + 0.8644 + 0.8630 + 0.8705 + - + - + + + hmean + 0.8182 + 0.8182 + 0.8182 + 0.8182 + 0.8172 + 0.8155 + - + - + CRNN TextRecognition @@ -1292,6 +1400,20 @@ Users can directly test the performance through [how_to_evaluate_a_model.md](../ - - + + ABINet + TextRecognition + IIIT5K + acc + 0.9603 + 0.9563 + 0.9563 + 0.9573 + 0.9507 + 0.9510 + - + - + diff --git a/docs/en/03-benchmark/supported_models.md b/docs/en/03-benchmark/supported_models.md index ec459dc3e8..d9b2400f5c 100644 --- a/docs/en/03-benchmark/supported_models.md +++ b/docs/en/03-benchmark/supported_models.md @@ -68,11 +68,15 @@ The table below lists the models that are guaranteed to be exportable to other b | [EDSR](https://github.com/open-mmlab/mmediting/tree/1.x/configs/edsr) | MMEditing | Y | Y | Y | Y | N | Y | N | N | | [RDN](https://github.com/open-mmlab/mmediting/tree/1.x/configs/rdn) | MMEditing | Y | Y | Y | Y | Y | Y | N | N | | [DBNet](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textdet/dbnet) | MMOCR | Y | Y | Y | Y | Y | Y | Y | N | +| [DBNetpp](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textdet/dbnetpp) | MMOCR | Y | Y | Y | ? | ? | Y | ? | N | | [PANet](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textdet/panet) | MMOCR | Y | Y | Y | Y | ? | Y | Y | N | | [PSENet](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textdet/psenet) | MMOCR | Y | Y | Y | Y | ? | Y | Y | N | +| [TextSnake](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textdet/textsnake) | MMOCR | Y | Y | Y | Y | ? | ? | ? | N | +| [MaskRCNN](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textdet/maskrcnn) | MMOCR | Y | Y | Y | ? | ? | ? | ? 
| N | | [CRNN](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textrecog/crnn) | MMOCR | Y | Y | Y | Y | Y | N | N | N | | [SAR](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textrecog/sar) | MMOCR | N | Y | N | N | N | N | N | N | | [SATRN](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textrecog/satrn) | MMOCR | Y | Y | Y | N | N | N | N | N | +| [ABINet](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textrecog/abinet) | MMOCR | Y | Y | Y | N | N | N | N | N | | [HRNet](https://mmpose.readthedocs.io/en/1.x/model_zoo_papers/backbones.html#hrnet-cvpr-2019) | MMPose | N | Y | Y | Y | N | Y | N | N | | [MSPN](https://mmpose.readthedocs.io/en/1.x/model_zoo_papers/backbones.html#mspn-arxiv-2019) | MMPose | N | Y | Y | Y | N | Y | N | N | | [LiteHRNet](https://mmpose.readthedocs.io/en/1.x/model_zoo_papers/backbones.html#litehrnet-cvpr-2021) | MMPose | N | Y | Y | N | N | Y | N | N | diff --git a/docs/en/04-supported-codebases/mmocr.md b/docs/en/04-supported-codebases/mmocr.md index 592eca36d1..d07d1d6b95 100644 --- a/docs/en/04-supported-codebases/mmocr.md +++ b/docs/en/04-supported-codebases/mmocr.md @@ -1,16 +1,20 @@ # MMOCR Deployment -- [Installation](#installation) - - [Install mmocr](#install-mmocr) - - [Install mmdeploy](#install-mmdeploy) -- [Convert model](#convert-model) - - [Convert text detection model](#convert-text-detection-model) - - [Convert text recognition model](#convert-text-recognition-model) -- [Model specification](#model-specification) -- [Model Inference](#model-inference) - - [Backend model inference](#backend-model-inference) - - [SDK model inference](#sdk-model-inference) -- [Supported models](#supported-models) +- [MMOCR Deployment](#mmocr-deployment) + - [Installation](#installation) + - [Install mmocr](#install-mmocr) + - [Install mmdeploy](#install-mmdeploy) + - [Convert model](#convert-model) + - [Convert text detection model](#convert-text-detection-model) + - [Convert text recognition model](#convert-text-recognition-model) + - [Model specification](#model-specification) + - [Model Inference](#model-inference) + - [Backend model inference](#backend-model-inference) + - [SDK model inference](#sdk-model-inference) + - [Text detection SDK model inference](#text-detection-sdk-model-inference) + - [Text Recognition SDK model inference](#text-recognition-sdk-model-inference) + - [Supported models](#supported-models) + - [Reminder](#reminder) ______________________________________________________________________ @@ -230,11 +234,29 @@ Besides python API, mmdeploy SDK also provides other FFI (Foreign Function Inter ## Supported models -| Model | Task | TorchScript | OnnxRuntime | TensorRT | ncnn | PPLNN | OpenVINO | -| :---------------------------------------------------------------------------- | :--------------- | :---------: | :---------: | :------: | :--: | :---: | :------: | -| [DBNet](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textdet/dbnet) | text-detection | Y | Y | Y | Y | Y | Y | -| [PSENet](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textdet/psenet) | text-detection | Y | Y | Y | Y | N | Y | -| [PANet](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textdet/panet) | text-detection | Y | Y | Y | Y | N | Y | -| [CRNN](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textrecog/crnn) | text-recognition | Y | Y | Y | Y | Y | N | -| [SAR](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textrecog/sar) | text-recognition | N | Y | N | N | N | N | -| 
[SATRN](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textrecog/satrn) | text-recognition | Y | Y | Y | N | N | N | +| Model | Task | TorchScript | OnnxRuntime | TensorRT | ncnn | PPLNN | OpenVINO | +| :---------------------------------------------------------------------------------- | :--------------- | :---------: | :---------: | :------: | :--: | :---: | :------: | +| [DBNet](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textdet/dbnet) | text-detection | Y | Y | Y | Y | Y | Y | +| [DBNetpp](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textdet/dbnetpp) | text-detection | N | Y | Y | ? | ? | Y | +| [PSENet](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textdet/psenet) | text-detection | Y | Y | Y | Y | N | Y | +| [PANet](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textdet/panet) | text-detection | Y | Y | Y | Y | N | Y | +| [TextSnake](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textdet/textsnake) | text-detection | Y | Y | Y | ? | ? | ? | +| [MaskRCNN](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textdet/maskrcnn) | text-detection | Y | Y | Y | ? | ? | ? | +| [CRNN](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textrecog/crnn) | text-recognition | Y | Y | Y | Y | Y | N | +| [SAR](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textrecog/sar) | text-recognition | N | Y | Y | N | N | N | +| [SATRN](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textrecog/satrn) | text-recognition | Y | Y | Y | N | N | N | +| [ABINet](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textrecog/abinet) | text-recognition | Y | Y | Y | ? | ? | ? | + +## Reminder + +- Deploying ABINet to TensorRT requires PyTorch 1.10+ and TensorRT 8.4+. + +- For the TensorRT backend, users have to choose the config whose input channels and shape range match the model. For example, CRNN only accepts 1-channel input.
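+  As a minimal sketch of pairing a model with its matching config (the image,
+  checkpoint and work-dir names below are placeholders, not files from this
+  repository):
+
+  ```python
+  from mmdeploy.apis import torch2onnx
+
+  # Export CRNN with the 1-channel dynamic-shape TensorRT config. The ONNX
+  # file produced here is what the TensorRT engine is then built from.
+  torch2onnx(
+      img='text_recog_demo.jpg',  # placeholder test image
+      work_dir='work_dir/crnn',
+      save_file='end2end.onnx',
+      deploy_cfg='configs/mmocr/text-recognition/'
+      'text-recognition_tensorrt_dynamic-1x32x32-1x32x640.py',
+      model_cfg='crnn_mini-vgg_5e_mj.py',  # the MMOCR model config
+      model_checkpoint='crnn_mini-vgg_5e_mj.pth',  # placeholder checkpoint
+      device='cpu')
+  ```
+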
Here is a recommendation table: + + | Model | Config | + | :------- | :--------------------------------------------------------- | + | MaskRCNN | text-detection_mrcnn_tensorrt_dynamic-320x320-2240x2240.py | + | CRNN | text-recognition_tensorrt_dynamic-1x32x32-1x32x640.py | + | SATRN | text-recognition_tensorrt_dynamic-32x32-32x640.py | + | SAR | text-recognition_tensorrt_dynamic-48x64-48x640.py | + | ABINet | text-recognition_tensorrt_static-32x128.py | diff --git a/docs/zh_cn/03-benchmark/benchmark.md b/docs/zh_cn/03-benchmark/benchmark.md index 04d3e5e067..1e1a0c5d10 100644 --- a/docs/zh_cn/03-benchmark/benchmark.md +++ b/docs/zh_cn/03-benchmark/benchmark.md @@ -1173,6 +1173,42 @@ GPU: ncnn, TensorRT, PPLNN 0.7949 0.7950 + + DBNetpp + TextDetection + ICDAR2015 + recall + 0.8209 + 0.8209 + 0.8209 + 0.8199 + 0.8204 + 0.8204 + - + 0.8209 + + + precision + 0.9079 + 0.9079 + 0.9079 + 0.9117 + 0.9117 + 0.9142 + - + 0.9079 + + + hmean + 0.8622 + 0.8622 + 0.8622 + 0.8634 + 0.8637 + 0.8648 + - + 0.8622 + PSENet TextDetection @@ -1245,6 +1281,78 @@ GPU: ncnn, TensorRT, PPLNN - 0.7955 + + TextSnake + TextDetection + CTW1500 + recall + 0.8052 + 0.8052 + 0.8052 + 0.8055 + - + - + - + - + + + precision + 0.8535 + 0.8535 + 0.8535 + 0.8538 + - + - + - + - + + + hmean + 0.8286 + 0.8286 + 0.8286 + 0.8290 + - + - + - + - + + + MaskRCNN + TextDetection + ICDAR2015 + recall + 0.7766 + 0.7766 + 0.7766 + 0.7766 + 0.7761 + 0.7670 + - + - + + + precision + 0.8644 + 0.8644 + 0.8644 + 0.8644 + 0.8630 + 0.8705 + - + - + + + hmean + 0.8182 + 0.8182 + 0.8182 + 0.8182 + 0.8172 + 0.8155 + - + - + CRNN TextRecognition @@ -1287,6 +1395,20 @@ GPU: ncnn, TensorRT, PPLNN - - + + ABINet + TextRecognition + IIIT5K + acc + 0.9603 + 0.9563 + 0.9563 + 0.9573 + 0.9507 + 0.9510 + - + - + diff --git a/docs/zh_cn/03-benchmark/supported_models.md b/docs/zh_cn/03-benchmark/supported_models.md index a58071df5a..a9eb83b747 100644 --- a/docs/zh_cn/03-benchmark/supported_models.md +++ b/docs/zh_cn/03-benchmark/supported_models.md @@ -68,11 +68,15 @@ | [EDSR](https://github.com/open-mmlab/mmediting/tree/1.x/configs/edsr) | MMEditing | Y | Y | Y | Y | N | Y | N | N | | [RDN](https://github.com/open-mmlab/mmediting/tree/1.x/configs/rdn) | MMEditing | Y | Y | Y | Y | Y | Y | N | N | | [DBNet](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textdet/dbnet) | MMOCR | Y | Y | Y | Y | Y | Y | Y | N | +| [DBNetpp](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textdet/dbnetpp) | MMOCR | Y | Y | Y | ? | ? | Y | ? | N | | [PANet](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textdet/panet) | MMOCR | Y | Y | Y | Y | ? | Y | Y | N | | [PSENet](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textdet/psenet) | MMOCR | Y | Y | Y | Y | ? | Y | Y | N | +| [TextSnake](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textdet/textsnake) | MMOCR | Y | Y | Y | Y | ? | ? | ? | N | +| [MaskRCNN](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textdet/maskrcnn) | MMOCR | Y | Y | Y | ? | ? | ? | ? 
| N | | [CRNN](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textrecog/crnn) | MMOCR | Y | Y | Y | Y | Y | N | N | N | | [SAR](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textrecog/sar) | MMOCR | N | Y | N | N | N | N | N | N | | [SATRN](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textrecog/satrn) | MMOCR | Y | Y | Y | N | N | N | N | N | +| [ABINet](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textrecog/abinet) | MMOCR | Y | Y | Y | N | N | N | N | N | | [HRNet](https://mmpose.readthedocs.io/en/1.x/model_zoo_papers/backbones.html#hrnet-cvpr-2019) | MMPose | N | Y | Y | Y | N | Y | N | N | | [MSPN](https://mmpose.readthedocs.io/en/1.x/model_zoo_papers/backbones.html#mspn-arxiv-2019) | MMPose | N | Y | Y | Y | N | Y | N | N | | [LiteHRNet](https://mmpose.readthedocs.io/en/1.x/model_zoo_papers/backbones.html#litehrnet-cvpr-2021) | MMPose | N | Y | Y | N | N | Y | N | N | diff --git a/docs/zh_cn/04-supported-codebases/mmocr.md b/docs/zh_cn/04-supported-codebases/mmocr.md index 73aea75d48..ff970f4b5b 100644 --- a/docs/zh_cn/04-supported-codebases/mmocr.md +++ b/docs/zh_cn/04-supported-codebases/mmocr.md @@ -1,18 +1,20 @@ # MMOCR 模型部署 -- [安装](#安装) - - [安装 mmocr](#安装-mmocr) - - [安装 mmdeploy](#安装-mmdeploy) -- [模型转换](#模型转换) - - [文字检测任务模型转换](#文字检测任务模型转换) - - [文字识别任务模型转换](#文字识别任务模型转换) -- [模型规范](#模型规范) -- [模型推理](#模型推理) - - [后端模型推理](#后端模型推理) - - [SDK 模型推理](#sdk-模型推理) - - [文字检测 SDK 模型推理](#文字检测-sdk-模型推理) - - [文字识别 SDK 模型推理](#文字识别-sdk-模型推理) -- [模型支持列表](#模型支持列表) +- [MMOCR 模型部署](#mmocr-模型部署) + - [安装](#安装) + - [安装 mmocr](#安装-mmocr) + - [安装 mmdeploy](#安装-mmdeploy) + - [模型转换](#模型转换) + - [文字检测任务模型转换](#文字检测任务模型转换) + - [文字识别任务模型转换](#文字识别任务模型转换) + - [模型规范](#模型规范) + - [模型推理](#模型推理) + - [后端模型推理](#后端模型推理) + - [SDK 模型推理](#sdk-模型推理) + - [文字检测 SDK 模型推理](#文字检测-sdk-模型推理) + - [文字识别 SDK 模型推理](#文字识别-sdk-模型推理) + - [模型支持列表](#模型支持列表) + - [注意事项](#注意事项) ______________________________________________________________________ @@ -236,11 +238,29 @@ print(texts) ## 模型支持列表 -| Model | Task | TorchScript | OnnxRuntime | TensorRT | ncnn | PPLNN | OpenVINO | -| :---------------------------------------------------------------------------- | :--------------- | :---------: | :---------: | :------: | :--: | :---: | :------: | -| [DBNet](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textdet/dbnet) | text-detection | Y | Y | Y | Y | Y | Y | -| [PSENet](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textdet/psenet) | text-detection | Y | Y | Y | Y | N | Y | -| [PANet](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textdet/panet) | text-detection | Y | Y | Y | Y | N | Y | -| [CRNN](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textrecog/crnn) | text-recognition | Y | Y | Y | Y | Y | N | -| [SAR](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textrecog/sar) | text-recognition | N | Y | N | N | N | N | -| [SATRN](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textrecog/satrn) | text-recognition | Y | Y | Y | N | N | N | +| Model | Task | TorchScript | OnnxRuntime | TensorRT | ncnn | PPLNN | OpenVINO | +| :---------------------------------------------------------------------------------- | :--------------- | :---------: | :---------: | :------: | :--: | :---: | :------: | +| [DBNet](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textdet/dbnet) | text-detection | Y | Y | Y | Y | Y | Y | +| [DBNetpp](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textdet/dbnetpp) | text-detection | N | Y | Y | ? | ? 
| Y | +| [PSENet](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textdet/psenet) | text-detection | Y | Y | Y | Y | N | Y | +| [PANet](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textdet/panet) | text-detection | Y | Y | Y | Y | N | Y | +| [TextSnake](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textdet/textsnake) | text-detection | Y | Y | Y | ? | ? | ? | +| [MaskRCNN](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textdet/maskrcnn) | text-detection | Y | Y | Y | ? | ? | ? | +| [CRNN](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textrecog/crnn) | text-recognition | Y | Y | Y | Y | Y | N | +| [SAR](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textrecog/sar) | text-recognition | N | Y | Y | N | N | N | +| [SATRN](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textrecog/satrn) | text-recognition | Y | Y | Y | N | N | N | +| [ABINet](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textrecog/abinet) | text-recognition | Y | Y | Y | ? | ? | ? | + +## 注意事项 + +- ABINet 在 TensorRT 后端要求使用 pytorch1.10+, TensorRT 8.4+。 + +- 对于 TensorRT 后端,用户需要使用正确的配置文件。比如 CRNN 只接受单通道输入。下面是一个示例表格: + + | Model | Config | + | :------- | :--------------------------------------------------------- | + | MaskRCNN | text-detection_mrcnn_tensorrt_dynamic-320x320-2240x2240.py | + | CRNN | text-recognition_tensorrt_dynamic-1x32x32-1x32x640.py | + | SATRN | text-recognition_tensorrt_dynamic-32x32-32x640.py | + | SAR | text-recognition_tensorrt_dynamic-48x64-48x640.py | + | ABINet | text-recognition_tensorrt_static-32x128.py | diff --git a/mmdeploy/codebase/mmocr/deploy/text_detection.py b/mmdeploy/codebase/mmocr/deploy/text_detection.py index 5d2f7e720a..8305199e0d 100644 --- a/mmdeploy/codebase/mmocr/deploy/text_detection.py +++ b/mmdeploy/codebase/mmocr/deploy/text_detection.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. +from copy import deepcopy from typing import Callable, Dict, Optional, Sequence, Tuple, Union import mmengine @@ -98,6 +99,41 @@ def __init__(self, model_cfg: mmengine.Config, deploy_cfg: mmengine.Config, device: str): super(TextDetection, self).__init__(model_cfg, deploy_cfg, device) + def build_pytorch_model(self, + model_checkpoint: Optional[str] = None, + cfg_options: Optional[Dict] = None, + **kwargs) -> torch.nn.Module: + """Initialize torch model. + + Args: + model_checkpoint (str): The checkpoint file of torch model, + defaults to `None`. + cfg_options (dict): Optional config key-pair parameters. + + Returns: + nn.Module: An initialized torch model generated by other OpenMMLab + codebases. 
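+
+        Note that for ``MMDetWrapper`` models (Mask R-CNN in MMOCR), the
+        wrapped config carries its own ``data_preprocessor``, so the
+        merged ``preprocess_cfg`` is not injected into them.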
+ """ + from mmengine.model import revert_sync_batchnorm + from mmengine.registry import MODELS + + model = deepcopy(self.model_cfg.model) + preprocess_cfg = deepcopy(self.model_cfg.get('preprocess_cfg', {})) + preprocess_cfg.update( + deepcopy(self.model_cfg.get('data_preprocessor', {}))) + model.setdefault('data_preprocessor', preprocess_cfg) + if model.type == 'MMDetWrapper': # Mask-RCNN in MMOCR + model = deepcopy(self.model_cfg.model) + model = MODELS.build(model) + if model_checkpoint is not None: + from mmengine.runner.checkpoint import load_checkpoint + load_checkpoint(model, model_checkpoint, map_location=self.device) + + model = revert_sync_batchnorm(model) + model = model.to(self.device) + model.eval() + return model + def build_backend_model(self, model_files: Optional[str] = None, **kwargs) -> torch.nn.Module: @@ -225,7 +261,12 @@ def get_preprocess(self, *args, **kwargs) -> Dict: if transform['type'] == 'Resize': transforms[i]['size'] = transforms[i].pop('scale') - data_preprocessor = model_cfg.model.data_preprocessor + if 'data_preprocessor' in model_cfg.model: + data_preprocessor = model_cfg.model.data_preprocessor + elif 'MMDetWrapper' == self.model_cfg.model.type: + data_preprocessor = model_cfg.model.cfg.data_preprocessor + else: + raise ValueError(f'Unsupported model config {model_cfg.model} ') transforms.insert(-1, dict(type='DefaultFormatBundle')) transforms.insert( -2, @@ -247,7 +288,20 @@ def get_postprocess(self, *args, **kwargs) -> Dict: Return: dict: Composed of the postprocess information. """ - postprocess = self.model_cfg.model.det_head + if 'det_head' in self.model_cfg.model: + postprocess = self.model_cfg.model.det_head + elif 'MMDetWrapper' == self.model_cfg.model.type: + params = self.model_cfg.model.cfg.test_cfg + type = 'ResizeInstanceMask' # default for object detection + if 'rpn' in params: + params['min_bbox_size'] = params['rpn']['min_bbox_size'] + if 'rcnn' in params: + params['score_thr'] = params['rcnn']['score_thr'] + if 'mask_thr_binary' in params['rcnn']: + params['mask_thr_binary'] = params['rcnn'][ + 'mask_thr_binary'] + type = 'ResizeInstanceMask' # for instance-seg + return dict(type=type, params=params) return postprocess def get_model_name(self, *args, **kwargs) -> str: diff --git a/mmdeploy/codebase/mmocr/deploy/text_detection_model.py b/mmdeploy/codebase/mmocr/deploy/text_detection_model.py index da5d8720f5..de0c407666 100644 --- a/mmdeploy/codebase/mmocr/deploy/text_detection_model.py +++ b/mmdeploy/codebase/mmocr/deploy/text_detection_model.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from typing import List, Optional, Sequence, Union +import cv2 import mmengine import torch from mmengine.registry import Registry @@ -36,14 +37,22 @@ def __init__( model_cfg: Optional[mmengine.Config] = None, **kwargs, ): + data_preprocessor = model_cfg.model.get('data_preprocessor', {}) + if data_preprocessor is not None: # skip when it is SDKEnd2EndModel + data_preprocessor.update( + model_cfg.model.get('cfg', {}).get('data_preprocessor', {})) + if data_preprocessor.get('type', None) == 'DetDataPreprocessor': + data_preprocessor.update(_scope_='mmdet') # MRCNN super(End2EndModel, self).__init__( - deploy_cfg=deploy_cfg, - data_preprocessor=model_cfg.model.data_preprocessor) + deploy_cfg=deploy_cfg, data_preprocessor=data_preprocessor) self.deploy_cfg = deploy_cfg self.show_score = False from mmocr.registry import MODELS - self.det_head = MODELS.build(model_cfg.model.det_head) + if hasattr(model_cfg.model, 'det_head'): + self.det_head = MODELS.build(model_cfg.model.det_head) + else: + self.text_repr_type = model_cfg.model.get('text_repr_type', 'poly') self._init_wrapper( backend=backend, backend_files=backend_files, @@ -101,7 +110,79 @@ def forward(self, instance, in (xn, yn) order. """ x = self.extract_feat(inputs) - return self.det_head.postprocessor(x[0], data_samples) + if hasattr(self, 'det_head'): + return self.det_head.postprocessor(x[0], data_samples) + # post-process of mmdet models + from mmdet.structures.mask import bitmap_to_polygon + from mmocr.utils.bbox_utils import bbox2poly + + from mmdeploy.codebase.mmdet.deploy import get_post_processing_params + from mmdeploy.codebase.mmdet.deploy.object_detection_model import \ + End2EndModel as DetModel + if len(x) == 3: # instance seg + batch_dets, _, batch_masks = x + for i in range(batch_dets.size(0)): + masks = batch_masks[i] + bboxes = batch_dets[i, :, :4] + bboxes[:, ::2] /= data_samples[i].scale_factor[0] + bboxes[:, 1::2] /= data_samples[i].scale_factor[1] + ori_h, ori_w = data_samples[i].ori_shape[:2] + img_h, img_w = data_samples[i].img_shape[:2] + export_postprocess_mask = True + polygons = [] + scores = [] + if self.deploy_cfg is not None: + codebase_cfg = get_post_processing_params(self.deploy_cfg) + # this flag enable postprocess when export. 
+ export_postprocess_mask = codebase_cfg.get( + 'export_postprocess_mask', True) + if not export_postprocess_mask: + masks = DetModel.postprocessing_masks( + bboxes, masks, ori_w, ori_h, batch_masks.device) + else: + masks = masks[:, :img_h, :img_w] + masks = torch.nn.functional.interpolate( + masks.unsqueeze(0).float(), size=(ori_h, ori_w)) + masks = masks.squeeze(0) + if masks.dtype != bool: + masks = masks >= 0.5 + + for mask_idx, mask in enumerate(masks.cpu()): + contours, _ = bitmap_to_polygon(mask) + polygons += [contour.reshape(-1) for contour in contours] + scores += [batch_dets[i, :, 4][mask_idx].cpu() + ] * len(contours) + # filter invalid polygons + filterd_polygons = [] + keep_idx = [] + for poly_idx, polygon in enumerate(polygons): + if len(polygon) < 6: + continue + filterd_polygons.append(polygon) + keep_idx.append(poly_idx) + # convert by text_repr_type + if self.text_repr_type == 'quad': + for j, poly in enumerate(filterd_polygons): + rect = cv2.minAreaRect(poly) + vertices = cv2.boxPoints(rect) + poly = vertices.flatten() + filterd_polygons[j] = poly + pred_instances = InstanceData() + pred_instances.polygons = filterd_polygons + pred_instances.scores = torch.FloatTensor(scores)[keep_idx] + data_samples[i].pred_instances = pred_instances + else: + dets = x[0] + for i in range(dets.size(0)): + bboxes = dets[i, :, :4].cpu().numpy() + bboxes[:, ::2] /= data_samples[i].scale_factor[0] + bboxes[:, 1::2] /= data_samples[i].scale_factor[1] + polygons = [bbox2poly(bbox) for bbox in bboxes] + pred_instances = InstanceData() + pred_instances.polygons = polygons + pred_instances.scores = torch.FloatTensor(dets[i, :, 4].cpu()) + data_samples[i].pred_instances = pred_instances + return data_samples def extract_feat(self, batch_inputs: torch.Tensor) -> torch.Tensor: """The interface for forward test. diff --git a/mmdeploy/codebase/mmocr/models/text_detection/__init__.py b/mmdeploy/codebase/mmocr/models/text_detection/__init__.py index abaed26541..91cd1655f3 100644 --- a/mmdeploy/codebase/mmocr/models/text_detection/__init__.py +++ b/mmdeploy/codebase/mmocr/models/text_detection/__init__.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. from . import fpn_cat # noqa: F401,F403 from . import heads # noqa: F401,F403 +from . import mmdet_wrapper # noqa: F401,F403 from . import single_stage_text_detector # noqa: F401,F403 diff --git a/mmdeploy/codebase/mmocr/models/text_detection/heads.py b/mmdeploy/codebase/mmocr/models/text_detection/heads.py index d23c957ff4..3c408de304 100644 --- a/mmdeploy/codebase/mmocr/models/text_detection/heads.py +++ b/mmdeploy/codebase/mmocr/models/text_detection/heads.py @@ -2,7 +2,7 @@ from typing import Dict import torch -from mmocr.utils.typing_utils import DetSampleList +from mmocr.utils import DetSampleList from mmdeploy.core import FUNCTION_REWRITER diff --git a/mmdeploy/codebase/mmocr/models/text_detection/mmdet_wrapper.py b/mmdeploy/codebase/mmocr/models/text_detection/mmdet_wrapper.py new file mode 100644 index 0000000000..67adc3a45c --- /dev/null +++ b/mmdeploy/codebase/mmocr/models/text_detection/mmdet_wrapper.py @@ -0,0 +1,56 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
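+# MMOCR's MMDetWrapper.forward converts MMDet predictions back into
+# TextDetDataSample objects in Python, which does not trace during export.
+# The rewriter below therefore only re-packs the metainfo into
+# DetDataSamples and returns the wrapped MMDet model's (rewritten) raw
+# outputs; mask-to-polygon conversion is deferred to
+# text_detection_model.End2EndModel.forward.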
+from typing import Optional, Sequence, Union + +import torch +from mmdet.structures import DetDataSample +from mmdet.structures import SampleList as MMDET_SampleList +from mmocr.structures import TextDetDataSample +from mmocr.utils.typing_utils import DetSampleList + +from mmdeploy.core import FUNCTION_REWRITER + + +@FUNCTION_REWRITER.register_rewriter( + func_name='mmocr.models.textdet.detectors.MMDetWrapper.forward') +def mmdet_wrapper__forward(self, + inputs: torch.Tensor, + data_samples: Optional[Union[ + DetSampleList, MMDET_SampleList]] = None, + mode: str = 'tensor', + **kwargs) -> Sequence[TextDetDataSample]: + """The unified entry for a forward process in both training and test. + + The method works in three modes: "tensor", "predict" and "loss": + + - "tensor": Forward the whole network and return tensor or tuple of + tensor without any post-processing, same as a common nn.Module. + - "predict": Forward and return the predictions, which are fully + processed to a list of :obj:`DetDataSample`. + - "loss": Forward and return a dict of losses according to the given + inputs and data samples. + + Note that this method doesn't handle either back propagation or + parameter update, which are supposed to be done in :meth:`train_step`. + + Args: + inputs (torch.Tensor): The input tensor with shape + (N, C, ...) in general. + data_samples (list[:obj:`DetDataSample`] or + list[:obj:`TextDetDataSample`]): The annotation data of every + sample. When in "predict" mode, it should be a list of + :obj:`TextDetDataSample`. Otherwise they are + :obj:`DetDataSample`s. Defaults to None. + mode (str): Running mode. Defaults to 'tensor'. + + Returns: + results (Sequence(torch.Tensor)): Output of MMDet models. + """ + if mode == 'predict': + ocr_data_samples = data_samples + data_samples = [] + for i in range(len(ocr_data_samples)): + data_samples.append( + DetDataSample(metainfo=ocr_data_samples[i].metainfo)) + + results = self.wrapped_model.forward(inputs, data_samples, mode, **kwargs) + return results diff --git a/mmdeploy/codebase/mmocr/models/text_recognition/__init__.py b/mmdeploy/codebase/mmocr/models/text_recognition/__init__.py index f3ed1e7976..b2991af218 100644 --- a/mmdeploy/codebase/mmocr/models/text_recognition/__init__.py +++ b/mmdeploy/codebase/mmocr/models/text_recognition/__init__.py @@ -1,7 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. +from . import abi_language_decoder # noqa: F401,F403 from . import base_decoder # noqa: F401,F403 from . import crnn_decoder # noqa: F401,F403 from . import encoder_decoder_recognizer # noqa: F401,F403 from . import lstm_layer # noqa: F401,F403 from . import sar_decoder # noqa: F401,F403 from . import sar_encoder # noqa: F401,F403 +from . import transformer_module # noqa: F401,F403 diff --git a/mmdeploy/codebase/mmocr/models/text_recognition/abi_language_decoder.py b/mmdeploy/codebase/mmocr/models/text_recognition/abi_language_decoder.py new file mode 100644 index 0000000000..3e5d242297 --- /dev/null +++ b/mmdeploy/codebase/mmocr/models/text_recognition/abi_language_decoder.py @@ -0,0 +1,31 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmdeploy.core import FUNCTION_REWRITER +from mmdeploy.utils import IR + + +@FUNCTION_REWRITER.register_rewriter( + func_name='mmocr.models.textrecog.decoders.ABILanguageDecoder._get_length', + IR=IR.ONNX) +def abi_language_decoder___get_length__default(self, + logit: torch.Tensor, + dim: int = -1, + **kwargs) -> torch.Tensor: + """Rewrite `_get_length`. 
Add `.float()` to cast Tensors from bool to float + for `cumsum` and `argmax`. + + Returns the first location of padding index or the length of the entire + tensor otherwise. + """ + # out as a boolean vector indicating the existence of end token(s) + out = (logit.argmax(dim=-1) == self.dictionary.end_idx) + abn = out.any(dim) + # Get the first index of end token + # add `.float()` to `out` for onnxruntime `cumsum()` + # add `.float()` before `argmax()` + out = ((out.float().cumsum(dim) == 1) & out).float().argmax(dim) + out = out + 1 + out = torch.where(abn, out, + out.new_tensor(logit.shape[1]).to(out.device)).float() + return out diff --git a/mmdeploy/codebase/mmocr/models/text_recognition/transformer_module.py b/mmdeploy/codebase/mmocr/models/text_recognition/transformer_module.py new file mode 100644 index 0000000000..d5cfad4de7 --- /dev/null +++ b/mmdeploy/codebase/mmocr/models/text_recognition/transformer_module.py @@ -0,0 +1,44 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch import nn + +from mmdeploy.core import MODULE_REWRITER + + +@MODULE_REWRITER.register_rewrite_module( + 'mmocr.models.common.modules.PositionalEncoding', backend='default') +class PositionalEncoding(nn.Module): + """Rewrite Position Encoding module in `ABINet.""" + + def __init__(self, module, deploy_cfg, **kwargs): + super(PositionalEncoding, self).__init__() + self._module = module + self.deploy_cfg = deploy_cfg + self.n_position = module.position_table.size(1) + self.d_hid = module.position_table.size(2) + + def _get_sinusoid_encoding_table(self, n_position, d_hid, device): + """Sinusoid position encoding table.""" + denominator = torch.Tensor([ + 1.0 / torch.tensor(10000).to(device).pow( + torch.tensor(2 * (hid_j // 2) / d_hid)).to(device) + for hid_j in range(d_hid) + ]).to(device) + denominator = denominator.view(1, -1) + pos_tensor = torch.arange(n_position).to(device).unsqueeze(-1).float() + sinusoid_table = pos_tensor * denominator + sinusoid_table[:, 0::2] = torch.sin(sinusoid_table[:, 0::2]) + sinusoid_table[:, 1::2] = torch.cos(sinusoid_table[:, 1::2]) + + return sinusoid_table.unsqueeze(0) + + def forward(self, x): + """ + Args: + x (Tensor): Tensor of shape (batch_size, pos_len, d_hid, ...) + """ + device = x.device + position_table = self._get_sinusoid_encoding_table( + self.n_position, self.d_hid, device) + x = x + position_table[:, :x.size(1), ...] + return x diff --git a/mmdeploy/pytorch/functions/__init__.py b/mmdeploy/pytorch/functions/__init__.py index a07a2aab06..5ee8ef348d 100644 --- a/mmdeploy/pytorch/functions/__init__.py +++ b/mmdeploy/pytorch/functions/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from . import adaptive_pool # noqa: F401,F403 +from . import any # noqa: F401,F403 from . import atan2 # noqa: F401,F403 from . import chunk # noqa: F401,F403 from . import clip # noqa: F401,F403 diff --git a/mmdeploy/pytorch/functions/any.py b/mmdeploy/pytorch/functions/any.py new file mode 100644 index 0000000000..469b7c327b --- /dev/null +++ b/mmdeploy/pytorch/functions/any.py @@ -0,0 +1,19 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
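+# `torch.any` does not always export cleanly to ONNX (the opsets provide no
+# ReduceAny), so the rewrite below re-expresses it as a reduction over
+# non-zero elements, (input != 0).sum(...) > 0, which lowers to standard
+# Cast/ReduceSum/Greater ops.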
+from typing import Optional + +import torch + +from mmdeploy.core import FUNCTION_REWRITER + + +@FUNCTION_REWRITER.register_rewriter(func_name='torch.Tensor.any') +@FUNCTION_REWRITER.register_rewriter(func_name='torch.any') +def any__default(input: torch.Tensor, + dim: Optional[int] = None, + keepdim: bool = False, + **kwargs) -> torch.Tensor: + """Rewrite `any` for ONNX.""" + if dim is None and keepdim is False: + return (input != 0).sum() > 0 + + return (input != 0).sum(dim, keepdim=keepdim) > 0 diff --git a/tests/regression/mmocr.yml b/tests/regression/mmocr.yml index 71df518d42..16ce8c5a18 100644 --- a/tests/regression/mmocr.yml +++ b/tests/regression/mmocr.yml @@ -34,6 +34,10 @@ onnxruntime: convert_image: *convert_image_det deploy_config: configs/mmocr/text-detection/text-detection_onnxruntime_dynamic.py + pipeline_ort_detection_mrcnn_dynamic_fp32: &pipeline_ort_detection_mrcnn_dynamic_fp32 + convert_image: *convert_image_det + deploy_config: configs/mmocr/text-detection/text-detection_mrcnn_onnxruntime_dynamic.py + # ======= recognition ======= pipeline_ort_recognition_static_fp32: &pipeline_ort_recognition_static_fp32 convert_image: *convert_image_rec @@ -69,12 +73,24 @@ tensorrt: sdk_config: *sdk_detection_dynamic deploy_config: configs/mmocr/text-detection/text-detection_tensorrt_dynamic-320x320-2240x2240.py + pipeline_trt_detection_mrcnn_dynamic_fp32: &pipeline_trt_detection_mrcnn_dynamic_fp32 + convert_image: *convert_image_det + backend_test: *default_backend_test + sdk_config: *sdk_detection_dynamic + deploy_config: configs/mmocr/text-detection/text-detection_mrcnn_tensorrt_dynamic-320x320-2240x2240.py + pipeline_trt_detection_dynamic_fp16: &pipeline_trt_detection_dynamic_fp16 convert_image: *convert_image_det backend_test: *default_backend_test sdk_config: *sdk_detection_dynamic deploy_config: configs/mmocr/text-detection/text-detection_tensorrt-fp16_dynamic-320x320-2240x2240.py + pipeline_trt_detection_mrcnn_dynamic_fp16: &pipeline_trt_detection_mrcnn_dynamic_fp16 + convert_image: *convert_image_det + backend_test: *default_backend_test + sdk_config: *sdk_detection_dynamic + deploy_config: configs/mmocr/text-detection/text-detection_mrcnn_tensorrt-fp16_dynamic-320x320-2240x2240.py + pipeline_trt_detection_dynamic_int8: &pipeline_trt_detection_dynamic_int8 convert_image: *convert_image_det backend_test: *default_backend_test @@ -82,42 +98,82 @@ tensorrt: deploy_config: configs/mmocr/text-detection/text-detection_tensorrt-int8_dynamic-320x320-2240x2240.py # ======= recognition ======= - pipeline_trt_recognition_static_fp32: &pipeline_trt_recognition_static_fp32 + pipeline_trt_recognition_static_fp32_C1: &pipeline_trt_recognition_static_fp32_C1 convert_image: *convert_image_rec backend_test: *default_backend_test sdk_config: *sdk_recognition_dynamic deploy_config: configs/mmocr/text-recognition/text-recognition_tensorrt_static-1x32x32.py - pipeline_trt_recognition_static_fp16: &pipeline_trt_recognition_static_fp16 + # ABINet models with static shape 32x128 + pipeline_trt_recognition_static_fp32_C3: &pipeline_trt_recognition_static_fp32_C3 + convert_image: *convert_image_rec + backend_test: *default_backend_test + sdk_config: *sdk_recognition_dynamic + deploy_config: configs/mmocr/text-recognition/text-recognition_tensorrt_static-32x128.py + + pipeline_trt_recognition_static_fp16_C3: &pipeline_trt_recognition_static_fp16_C3 + convert_image: *convert_image_rec + backend_test: *default_backend_test + sdk_config: *sdk_recognition_dynamic + deploy_config:
configs/mmocr/text-recognition/text-recognition_tensorrt-fp16_static-32x128.py + + # SAR models with height 48 and channel 3 + pipeline_trt_recognition_dynamic_fp32_H48_C3: &pipeline_trt_recognition_dynamic_fp32_H48_C3 + convert_image: *convert_image_rec + backend_test: *default_backend_test + sdk_config: *sdk_recognition_dynamic + deploy_config: configs/mmocr/text-recognition/text-recognition_tensorrt_dynamic-48x64-48x640.py + + pipeline_trt_recognition_dynamic_fp16_H48_C3: &pipeline_trt_recognition_dynamic_fp16_H48_C3 convert_image: *convert_image_rec backend_test: *default_backend_test sdk_config: *sdk_recognition_dynamic - deploy_config: configs/mmocr/text-recognition/text-recognition_tensorrt-fp16_static-1x32x32.py + deploy_config: configs/mmocr/text-recognition/text-recognition_tensorrt-fp16_dynamic-48x64-48x640.py - pipeline_trt_recognition_static_int8: &pipeline_trt_recognition_static_int8 + pipeline_trt_recognition_dynamic_int8_H48_C3: &pipeline_trt_recognition_dynamic_int8_H48_C3 convert_image: *convert_image_rec backend_test: *default_backend_test sdk_config: *sdk_recognition_dynamic - deploy_config: configs/mmocr/text-recognition/text-recognition_tensorrt-int8_static-1x32x32.py + deploy_config: configs/mmocr/text-recognition/text-recognition_tensorrt-int8_dynamic-48x64-48x640.py - pipeline_trt_recognition_dynamic_fp32: &pipeline_trt_recognition_dynamic_fp32 + # CRNN models with height 32 and channel 1 + pipeline_trt_recognition_dynamic_fp32_H32_C1: &pipeline_trt_recognition_dynamic_fp32_H32_C1 convert_image: *convert_image_rec backend_test: *default_backend_test sdk_config: *sdk_recognition_dynamic deploy_config: configs/mmocr/text-recognition/text-recognition_tensorrt_dynamic-1x32x32-1x32x640.py - pipeline_trt_recognition_dynamic_fp16: &pipeline_trt_recognition_dynamic_fp16 + pipeline_trt_recognition_dynamic_fp16_H32_C1: &pipeline_trt_recognition_dynamic_fp16_H32_C1 convert_image: *convert_image_rec backend_test: *default_backend_test sdk_config: *sdk_recognition_dynamic deploy_config: configs/mmocr/text-recognition/text-recognition_tensorrt-fp16_dynamic-1x32x32-1x32x640.py - pipeline_trt_recognition_dynamic_int8: &pipeline_trt_recognition_dynamic_int8 + pipeline_trt_recognition_dynamic_int8_H32_C1: &pipeline_trt_recognition_dynamic_int8_H32_C1 convert_image: *convert_image_rec backend_test: *default_backend_test sdk_config: *sdk_recognition_dynamic deploy_config: configs/mmocr/text-recognition/text-recognition_tensorrt-int8_dynamic-1x32x32-1x32x640.py + # SATRN models with height 32 and channel 3 + pipeline_trt_recognition_dynamic_fp32_H32_C3: &pipeline_trt_recognition_dynamic_fp32_H32_C3 + convert_image: *convert_image_rec + backend_test: *default_backend_test + sdk_config: *sdk_recognition_dynamic + deploy_config: configs/mmocr/text-recognition/text-recognition_tensorrt_dynamic-32x32-32x640.py + + pipeline_trt_recognition_dynamic_fp16_H32_C3: &pipeline_trt_recognition_dynamic_fp16_H32_C3 + convert_image: *convert_image_rec + backend_test: *default_backend_test + sdk_config: *sdk_recognition_dynamic + deploy_config: configs/mmocr/text-recognition/text-recognition_tensorrt-fp16_dynamic-32x32-32x640.py + + pipeline_trt_recognition_dynamic_int8_H32_C3: &pipeline_trt_recognition_dynamic_int8_H32_C3 + convert_image: *convert_image_rec + backend_test: *default_backend_test + sdk_config: *sdk_recognition_dynamic + deploy_config: configs/mmocr/text-recognition/text-recognition_tensorrt-int8_dynamic-32x32-32x640.py + openvino: pipeline_openvino_detection_dynamic_fp32: 
&pipeline_openvino_detection_dynamic_fp32 convert_image: *convert_image_det @@ -157,6 +213,11 @@ torchscript: backend_test: False deploy_config: configs/mmocr/text-detection/text-detection_torchscript.py + pipeline_ts_detection_mrcnn_fp32: &pipeline_ts_detection_mrcnn_fp32 + convert_image: *convert_image_det + backend_test: False + deploy_config: configs/mmocr/text-detection/text-detection_mrcnn_torchscript.py + pipeline_ts_recognition_fp32: &pipeline_ts_recognition_fp32 convert_image: *convert_image_rec backend_test: False @@ -176,6 +237,16 @@ models: - *pipeline_pplnn_detection_dynamic_fp32 - *pipeline_openvino_detection_dynamic_fp32 + - name: DBNetpp + metafile: configs/textdet/dbnetpp/metafile.yml + model_configs: + - configs/textdet/dbnetpp/dbnetpp_resnet50_fpnc_1200e_icdar2015.py + pipelines: + - *pipeline_ort_detection_dynamic_fp32 + - *pipeline_trt_detection_dynamic_fp16 + - *pipeline_ncnn_detection_static_fp32 + - *pipeline_openvino_detection_dynamic_fp32 + - name: PANet metafile: configs/textdet/panet/metafile.yml model_configs: @@ -188,6 +259,36 @@ models: - *pipeline_pplnn_detection_dynamic_fp32 - *pipeline_openvino_detection_dynamic_fp32 + - name: PSENet + metafile: configs/textdet/psenet/metafile.yml + model_configs: + - configs/textdet/psenet/psenet_resnet50_fpnf_600e_icdar2015.py + pipelines: + - *pipeline_ts_detection_fp32 + - *pipeline_ort_detection_dynamic_fp32 + - *pipeline_trt_detection_dynamic_fp16 + - *pipeline_ncnn_detection_static_fp32 + - *pipeline_pplnn_detection_dynamic_fp32 + - *pipeline_openvino_detection_dynamic_fp32 + + - name: TextSnake + metafile: configs/textdet/textsnake/metafile.yml + model_configs: + - configs/textdet/textsnake/textsnake_resnet50_fpn-unet_1200e_ctw1500.py + pipelines: + - *pipeline_ts_detection_fp32 + - *pipeline_ort_detection_dynamic_fp32 + - *pipeline_trt_detection_dynamic_fp32 + + - name: MaskRCNN + metafile: configs/textdet/maskrcnn/metafile.yml + model_configs: + - configs/textdet/maskrcnn/mask-rcnn_resnet50_fpn_160e_icdar2015.py + pipelines: + - *pipeline_ts_detection_mrcnn_fp32 + - *pipeline_ort_detection_mrcnn_dynamic_fp32 + - *pipeline_trt_detection_mrcnn_dynamic_fp32 + - name: CRNN metafile: configs/textrecog/crnn/metafile.yml model_configs: @@ -195,6 +296,33 @@ models: pipelines: - *pipeline_ts_recognition_fp32 - *pipeline_ort_recognition_dynamic_fp32 - - *pipeline_trt_recognition_dynamic_fp16 + - *pipeline_trt_recognition_dynamic_fp16_H32_C1 - *pipeline_ncnn_recognition_static_fp32 - *pipeline_pplnn_recognition_dynamic_fp32 + + - name: SAR + metafile: configs/textrecog/sar/metafile.yml + model_configs: + - configs/textrecog/sar/sar_resnet31_parallel-decoder_5e_st-sub_mj-sub_sa_real.py + pipelines: + - *pipeline_ts_recognition_fp32 + - *pipeline_ort_recognition_dynamic_fp32 + - *pipeline_trt_recognition_dynamic_fp32_H48_C3 + + - name: SATRN + metafile: configs/textrecog/satrn/metafile.yml + model_configs: + - configs/textrecog/satrn/satrn_shallow-small_5e_st_mj.py + pipelines: + - *pipeline_ts_recognition_fp32 + - *pipeline_ort_recognition_dynamic_fp32 + - *pipeline_trt_recognition_dynamic_fp32_H32_C3 + + - name: ABINet + metafile: configs/textrecog/abinet/metafile.yml + model_configs: + - configs/textrecog/abinet/abinet_20e_st-an_mj.py + pipelines: + - *pipeline_ts_recognition_fp32 + - *pipeline_ort_recognition_static_fp32 + - *pipeline_trt_recognition_static_fp16_C3 diff --git a/tests/test_codebase/test_mmocr/data/mrcnn.py b/tests/test_codebase/test_mmocr/data/mrcnn.py new file mode 100644 index 0000000000..732c834423 --- 
/dev/null +++ b/tests/test_codebase/test_mmocr/data/mrcnn.py @@ -0,0 +1,135 @@ +# Copyright (c) OpenMMLab. All rights reserved. +model = dict( + type='MMDetWrapper', + text_repr_type='poly', + cfg=dict( + type='MaskRCNN', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_mask=False, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[4], + ratios=[0.17, 0.44, 1.13, 2.9, 7.46], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0.0, 0.0, 0.0, 0.0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict( + type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=1, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0.0, 0.0, 0.0, 0.0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict( + type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type='FCNMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=1, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100, + mask_thr_binary=0.5)), + _scope_='mmdet')) diff --git a/tests/test_codebase/test_mmocr/test_mmocr_models.py b/tests/test_codebase/test_mmocr/test_mmocr_models.py index 87dcd49783..1d07d5f9e6 100644 --- a/tests/test_codebase/test_mmocr/test_mmocr_models.py +++ 
diff --git a/tests/test_codebase/test_mmocr/test_mmocr_models.py b/tests/test_codebase/test_mmocr/test_mmocr_models.py
index 87dcd49783..1d07d5f9e6 100644
--- a/tests/test_codebase/test_mmocr/test_mmocr_models.py
+++ b/tests/test_codebase/test_mmocr/test_mmocr_models.py
@@ -9,6 +9,7 @@
 from mmdeploy.codebase import import_codebase
 from mmdeploy.core import RewriterContext, patch_model
 from mmdeploy.utils import Backend, Codebase
+from mmdeploy.utils.config_utils import load_config
 from mmdeploy.utils.test import (WrapModel, check_backend, get_model_outputs,
                                  get_rewrite_outputs)

@@ -22,7 +23,8 @@
 dictionary = dict(
     type='Dictionary',
     dict_file='tests/test_codebase/test_mmocr/data/lower_english_digits.txt',
-    with_padding=True)
+    with_padding=True,
+    with_end=True)


 class FPNCNeckModel(FPNC):
@@ -408,16 +410,6 @@ def test_sar_model(backend: Backend, decoder_type):
     sar_cfg.model.pop('type')
     pytorch_model = SARNet(**(sar_cfg.model))

-    # img_meta = {
-    #     'ori_shape': [48, 160],
-    #     'img_shape': [48, 160, 3],
-    #     'scale_factor': [1., 1.]
-    # }
-    # from mmengine.structures import InstanceData
-    # from mmocr.structures import TextRecogDataSample
-    # pred_instances = InstanceData(metainfo=img_meta)
-    # data_sample = TextRecogDataSample(pred_instances=pred_instances)
-    # data_sample.set_metainfo(img_meta)
     model_inputs = {'inputs': torch.rand(1, 3, 48, 160), 'data_samples': None}

     deploy_cfg = mmengine.Config(
@@ -461,3 +453,121 @@
         onnx.checker.check_model(model)
     except onnx.checker.ValidationError:
         assert False
+
+
+@pytest.mark.parametrize('backend', [Backend.ONNXRUNTIME])
+def test_mmdet_wrapper__forward(backend):
+    check_backend(backend)
+    from mmdet.structures import DetDataSample
+    from mmengine.structures import InstanceData
+    from mmocr.models.textdet import MMDetWrapper
+    cfg, = load_config('tests/test_codebase/test_mmocr/data/mrcnn.py')
+
+    model = MMDetWrapper(cfg.model.cfg)
+    model.eval()
+    deploy_cfg = mmengine.Config(
+        dict(
+            backend_config=dict(
+                type=backend.value,
+                common_config=dict(max_workspace_size=1 << 30)),
+            onnx_config=dict(
+                input_shape=None,
+                input_names=['inputs'],
+                output_names=['output']),
+            codebase_config=dict(
+                type='mmocr',
+                task='TextDetection',
+                post_processing=dict(
+                    score_threshold=0.05,
+                    confidence_threshold=0.005,
+                    iou_threshold=0.5,
+                    max_output_boxes_per_class=200,
+                    pre_top_k=5000,
+                    keep_top_k=100,
+                    background_label_id=-1,
+                    export_postprocess_mask=False))))
+
+    input = torch.rand(1, 3, 64, 64)
+    img_meta = {
+        'ori_shape': [64, 64],
+        'img_shape': [64, 64],
+        'scale_factor': [1., 1.],
+        'img_path': ''
+    }
+    pred_instances = InstanceData(metainfo=img_meta)
+    data_sample = DetDataSample(pred_instances=pred_instances)
+    data_sample.set_metainfo(img_meta)
+    wrapped_model = WrapModel(model, 'forward', data_samples=[data_sample])
+
+    rewrite_inputs = {'inputs': input}
+
+    rewrite_outputs, _ = get_rewrite_outputs(
+        wrapped_model=wrapped_model,
+        model_inputs=rewrite_inputs,
+        deploy_cfg=deploy_cfg,
+        run_with_backend=False)
+    assert rewrite_outputs is not None
+
+
+@pytest.mark.parametrize('backend', [Backend.ONNXRUNTIME])
+def test_abi_language_decoder___get_length(backend):
+    check_backend(backend)
+    from mmocr.models.textrecog.decoders import ABILanguageDecoder
+    model = ABILanguageDecoder(dictionary=dictionary)
+    input = torch.randn(1, 26, 37)
+    model_inputs = {'logit': input}
+    model_outputs = get_model_outputs(model, '_get_length', model_inputs)
+    wrapped_model = WrapModel(model, '_get_length')
+    rewrite_inputs = {'logit': input}
+    deploy_cfg = mmengine.Config(
+        dict(
+            backend_config=dict(type=backend.value),
+            onnx_config=dict(input_shape=None),
+            codebase_config=dict(
+                type='mmocr',
+                task='TextRecognition',
+            )))
+    rewrite_outputs, is_backend_output = get_rewrite_outputs(
+        wrapped_model=wrapped_model,
+        model_inputs=rewrite_inputs,
+        deploy_cfg=deploy_cfg)
+
+    if is_backend_output:
+        rewrite_outputs = rewrite_outputs[0]
+
+    model_outputs = model_outputs.float().cpu().numpy()
+    rewrite_outputs = rewrite_outputs.cpu().numpy()
+    print(model_outputs, rewrite_outputs)
+    assert np.allclose(model_outputs, rewrite_outputs, rtol=1e-03, atol=1e-05)
+
+
+@pytest.mark.parametrize('backend', [Backend.ONNXRUNTIME])
+def test__positional_encoding(backend):
+    check_backend(backend)
+    from mmocr.models.common.modules import PositionalEncoding
+    pytorch_model = PositionalEncoding(64, 20)
+    input = torch.rand(1, 20, 64)
+    model_inputs = {'x': input}
+    model_outputs = get_model_outputs(pytorch_model, 'forward', model_inputs)
+    wrapped_model = WrapModel(pytorch_model, 'forward')
+    rewrite_inputs = {'x': input}
+    deploy_cfg = mmengine.Config(
+        dict(
+            backend_config=dict(type=backend.value),
+            onnx_config=dict(input_shape=None),
+            codebase_config=dict(
+                type='mmocr',
+                task='TextRecognition',
+            )))
+    rewrite_outputs, is_backend_output = get_rewrite_outputs(
+        wrapped_model=wrapped_model,
+        model_inputs=rewrite_inputs,
+        deploy_cfg=deploy_cfg)
+
+    if is_backend_output:
+        rewrite_outputs = rewrite_outputs[0]
+
+    model_outputs = model_outputs.float().cpu().numpy()
+    rewrite_outputs = rewrite_outputs.cpu().numpy()
+    print(model_outputs, rewrite_outputs)
+    assert np.allclose(model_outputs, rewrite_outputs, rtol=1e-03, atol=1e-05)
diff --git a/tests/test_pytorch/test_pytorch_functions.py b/tests/test_pytorch/test_pytorch_functions.py
index 6b296c9198..245bfba9d4 100644
--- a/tests/test_pytorch/test_pytorch_functions.py
+++ b/tests/test_pytorch/test_pytorch_functions.py
@@ -557,6 +557,31 @@ def _pad_(x):
     pytorch_output, rewrite_output[0], rtol=1e-3, atol=1e-5)


+@backend_checker(Backend.ONNXRUNTIME)
+@pytest.mark.parametrize('dim', [0, -1])
+@pytest.mark.parametrize('keepdim', [True, False])
+def test_any__default(dim, keepdim):
+    input = torch.rand(2, 4)
+    model = WrapFunction(lambda input: input.any(dim, keepdim=keepdim))
+    pytorch_output = model(input)
+    deploy_cfg_ort = Config(
+        dict(
+            onnx_config=dict(input_shape=None),
+            backend_config=dict(type='onnxruntime'),
+            codebase_config=dict(type='mmdet', task='ObjectDetection')))
+    rewrite_output, _ = get_rewrite_outputs(
+        model,
+        model_inputs={'input': input},
+        deploy_cfg=deploy_cfg_ort,
+        run_with_backend=True)
+    assert pytorch_output.dtype == rewrite_output[0].dtype
+    assert torch.allclose(
+        pytorch_output.float(),
+        rewrite_output[0].float(),
+        rtol=1e-3,
+        atol=1e-5)
+
+
 @backend_checker(Backend.ONNXRUNTIME)
 def test_linspace__default():
     import random
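The `test_any__default` case above exercises a rewrite of `torch.Tensor.any`, which older ONNX opsets do not support directly. A minimal sketch of the usual workaround (compare, cast, reduce-sum, compare again) is shown below; this is an illustrative paraphrase, not necessarily mmdeploy's exact rewriter:

    import torch

    def onnx_friendly_any(x: torch.Tensor, dim: int,
                          keepdim: bool = False) -> torch.Tensor:
        # any(dim) == "is there at least one nonzero element along dim?"
        return (x != 0).to(torch.int32).sum(dim=dim, keepdim=keepdim) > 0

    x = torch.rand(2, 4)
    assert torch.equal(x.any(0), onnx_friendly_any(x, 0))
    assert torch.equal(x.any(-1, keepdim=True),
                       onnx_friendly_any(x, -1, keepdim=True))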