diff --git a/configs/det/det_repsvtr_db.yml b/configs/det/det_repsvtr_db.yml
new file mode 100644
index 0000000000..8c4768e714
--- /dev/null
+++ b/configs/det/det_repsvtr_db.yml
@@ -0,0 +1,169 @@
+Global:
+  debug: false
+  use_gpu: true
+  epoch_num: &epoch_num 500
+  log_smooth_window: 20
+  print_batch_step: 100
+  save_model_dir: ./output/det_repsvtr_db
+  save_epoch_step: 10
+  eval_batch_step:
+    - 0
+    - 1000
+  cal_metric_during_train: false
+  checkpoints:
+  pretrained_model:
+  save_inference_dir: null
+  use_visualdl: false
+  infer_img: doc/imgs_en/img_10.jpg
+  save_res_path: ./checkpoints/det_db/predicts_db.txt
+  distributed: true
+
+Architecture:
+  model_type: det
+  algorithm: DB
+  Transform: null
+  Backbone:
+    name: RepSVTR_det
+  Neck:
+    name: RSEFPN
+    out_channels: 96
+    shortcut: True
+  Head:
+    name: DBHead
+    k: 50
+
+Loss:
+  name: DBLoss
+  balance_loss: true
+  main_loss_type: DiceLoss
+  alpha: 5
+  beta: 10
+  ohem_ratio: 3
+
+Optimizer:
+  name: Adam
+  beta1: 0.9
+  beta2: 0.999
+  lr:
+    name: Cosine
+    learning_rate: 0.001 #(8*8c)
+    warmup_epoch: 2
+  regularizer:
+    name: L2
+    factor: 5.0e-05
+
+PostProcess:
+  name: DBPostProcess
+  thresh: 0.3
+  box_thresh: 0.6
+  max_candidates: 1000
+  unclip_ratio: 1.5
+
+Metric:
+  name: DetMetric
+  main_indicator: hmean
+
+Train:
+  dataset:
+    name: SimpleDataSet
+    data_dir: ./train_data/icdar2015/text_localization/
+    label_file_list:
+      - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt
+    ratio_list: [1.0]
+    transforms:
+      - DecodeImage:
+          img_mode: BGR
+          channel_first: false
+      - DetLabelEncode: null
+      - CopyPaste: null
+      - IaaAugment:
+          augmenter_args:
+            - type: Fliplr
+              args:
+                p: 0.5
+            - type: Affine
+              args:
+                rotate:
+                  - -10
+                  - 10
+            - type: Resize
+              args:
+                size:
+                  - 0.5
+                  - 3
+      - EastRandomCropData:
+          size:
+            - 640
+            - 640
+          max_tries: 50
+          keep_ratio: true
+      - MakeBorderMap:
+          shrink_ratio: 0.4
+          thresh_min: 0.3
+          thresh_max: 0.7
+          total_epoch: *epoch_num
+      - MakeShrinkMap:
+          shrink_ratio: 0.4
+          min_text_size: 8
+          total_epoch: *epoch_num
+      - NormalizeImage:
+          scale: 1./255.
+          mean:
+            - 0.485
+            - 0.456
+            - 0.406
+          std:
+            - 0.229
+            - 0.224
+            - 0.225
+          order: hwc
+      - ToCHWImage: null
+      - KeepKeys:
+          keep_keys:
+            - image
+            - threshold_map
+            - threshold_mask
+            - shrink_map
+            - shrink_mask
+  loader:
+    shuffle: true
+    drop_last: false
+    batch_size_per_card: 8
+    num_workers: 8
+
+Eval:
+  dataset:
+    name: SimpleDataSet
+    data_dir: ./train_data/icdar2015/text_localization/
+    label_file_list:
+      - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt
+    transforms:
+      - DecodeImage:
+          img_mode: BGR
+          channel_first: false
+      - DetLabelEncode: null
+      - DetResizeForTest:
+      - NormalizeImage:
+          scale: 1./255.
+          mean:
+            - 0.485
+            - 0.456
+            - 0.406
+          std:
+            - 0.229
+            - 0.224
+            - 0.225
+          order: hwc
+      - ToCHWImage: null
+      - KeepKeys:
+          keep_keys:
+            - image
+            - shape
+            - polys
+            - ignore_tags
+  loader:
+    shuffle: false
+    drop_last: false
+    batch_size_per_card: 1
+    num_workers: 2
+profiler_options: null
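The `PostProcess` block above is terse, so a word on what `unclip_ratio: 1.5` controls: DB predicts shrunk text kernels, and post-processing grows each detected polygon back by an offset whose distance is `area * unclip_ratio / perimeter`. A minimal sketch of that unclip step, assuming `pyclipper` and `shapely` (both PaddleOCR dependencies) are installed; the sample box is made up:

```python
import numpy as np
import pyclipper
from shapely.geometry import Polygon


def unclip(box, unclip_ratio=1.5):
    # Grow the shrunk kernel back with a rounded polygon offset; the offset
    # distance follows the DB paper: area * ratio / perimeter.
    poly = Polygon(box)
    distance = poly.area * unclip_ratio / poly.length
    offset = pyclipper.PyclipperOffset()
    offset.AddPath(box.tolist(), pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
    return np.array(offset.Execute(distance)[0])


box = np.array([[10, 10], [90, 10], [90, 40], [10, 40]])
print(unclip(box))  # the same rectangle, expanded on all sides
```

Raising `unclip_ratio` yields looser boxes; `thresh` and `box_thresh` then binarize the probability map and filter low-confidence boxes, respectively.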
diff --git a/configs/rec/SVTRv2/rec_repsvtr_gtc.yml b/configs/rec/SVTRv2/rec_repsvtr_gtc.yml
new file mode 100644
index 0000000000..6d1340ee6f
--- /dev/null
+++ b/configs/rec/SVTRv2/rec_repsvtr_gtc.yml
@@ -0,0 +1,134 @@
+Global:
+  debug: false
+  use_gpu: true
+  epoch_num: 200
+  log_smooth_window: 20
+  print_batch_step: 10
+  save_model_dir: ./output/rec_repsvtr_gtc
+  save_epoch_step: 10
+  eval_batch_step: [0, 1000]
+  cal_metric_during_train: False
+  pretrained_model:
+  checkpoints:
+  save_inference_dir:
+  use_visualdl: false
+  infer_img: doc/imgs_words/ch/word_1.jpg
+  character_dict_path: ppocr/utils/ppocr_keys_v1.txt
+  max_text_length: &max_text_length 25
+  infer_mode: false
+  use_space_char: true
+  distributed: true
+  save_res_path: ./output/rec/predicts_repsvtr.txt
+
+Optimizer:
+  name: AdamW
+  beta1: 0.9
+  beta2: 0.999
+  epsilon: 1.e-8
+  weight_decay: 0.025
+  no_weight_decay_name: norm
+  one_dim_param_no_weight_decay: True
+  lr:
+    name: Cosine
+    learning_rate: 0.001 # 8gpus 192bs
+    warmup_epoch: 5
+
+
+Architecture:
+  model_type: rec
+  algorithm: SVTR_HGNet
+  Transform:
+  Backbone:
+    name: RepSVTR
+  Head:
+    name: MultiHead
+    head_list:
+      - CTCHead:
+          Neck:
+            name: svtr
+            dims: 256
+            depth: 2
+            hidden_dims: 256
+            kernel_size: [1, 3]
+            use_guide: True
+          Head:
+            fc_decay: 0.00001
+      - NRTRHead:
+          nrtr_dim: 384
+          max_text_length: *max_text_length
+          num_decoder_layers: 2
+
+Loss:
+  name: MultiLoss
+  loss_config_list:
+    - CTCLoss:
+    - NRTRLoss:
+
+PostProcess:
+  name: CTCLabelDecode
+
+Metric:
+  name: RecMetric
+  main_indicator: acc
+
+
+Train:
+  dataset:
+    name: MultiScaleDataSet
+    ds_width: false
+    data_dir: ./train_data/
+    ext_op_transform_idx: 1
+    label_file_list:
+      - ./train_data/train_list.txt
+    transforms:
+      - DecodeImage:
+          img_mode: BGR
+          channel_first: false
+      - RecAug:
+      - MultiLabelEncode:
+          gtc_encode: NRTRLabelEncode
+      - KeepKeys:
+          keep_keys:
+            - image
+            - label_ctc
+            - label_gtc
+            - length
+            - valid_ratio
+  sampler:
+    name: MultiScaleSampler
+    scales: [[320, 32], [320, 48], [320, 64]]
+    first_bs: &bs 192
+    fix_bs: false
+    divided_factor: [8, 16] # w, h
+    is_training: True
+  loader:
+    shuffle: true
+    batch_size_per_card: *bs
+    drop_last: true
+    num_workers: 8
+Eval:
+  dataset:
+    name: SimpleDataSet
+    data_dir: ./train_data
+    label_file_list:
+      - ./train_data/val_list.txt
+    transforms:
+      - DecodeImage:
+          img_mode: BGR
+          channel_first: false
+      - MultiLabelEncode:
+          gtc_encode: NRTRLabelEncode
+      - RecResizeImg:
+          image_shape: [3, 48, 320]
+      - KeepKeys:
+          keep_keys:
+            - image
+            - label_ctc
+            - label_gtc
+            - length
+            - valid_ratio
+  loader:
+    shuffle: false
+    drop_last: false
+    batch_size_per_card: 128
+    num_workers: 4
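All four configs in this PR use the same `Cosine` scheduler with linear warmup, so it is worth being explicit about the curve the `lr` block describes. A self-contained sketch (the step counts here are illustrative, not taken from the configs):

```python
import math


def lr_at(step, base_lr=1e-3, warmup_steps=5_000, total_steps=200_000):
    # Linear warmup from 0 to base_lr, then cosine decay back toward 0.
    if step < warmup_steps:
        return base_lr * step / warmup_steps
    progress = (step - warmup_steps) / (total_steps - warmup_steps)
    return base_lr * 0.5 * (1.0 + math.cos(math.pi * progress))


for step in (0, 2_500, 5_000, 100_000, 200_000):
    print(step, f"{lr_at(step):.6f}")
```

The `# 8gpus 192bs` comment indicates the learning rate was tuned for 8 GPUs at batch size 192 per card; if you change the global batch size, the usual practice is to scale the learning rate roughly linearly.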
diff --git a/configs/rec/SVTRv2/rec_svtrv2_gtc.yml b/configs/rec/SVTRv2/rec_svtrv2_gtc.yml
new file mode 100644
index 0000000000..d2ab95ac38
--- /dev/null
+++ b/configs/rec/SVTRv2/rec_svtrv2_gtc.yml
@@ -0,0 +1,145 @@
+Global:
+  debug: false
+  use_gpu: true
+  epoch_num: 200
+  log_smooth_window: 20
+  print_batch_step: 10
+  save_model_dir: ./output/rec_svtrv2_gtc
+  save_epoch_step: 10
+  eval_batch_step: [0, 1000]
+  cal_metric_during_train: False
+  pretrained_model:
+  checkpoints:
+  save_inference_dir:
+  use_visualdl: false
+  infer_img: doc/imgs_words/ch/word_1.jpg
+  character_dict_path: ppocr/utils/ppocr_keys_v1.txt
+  max_text_length: &max_text_length 25
+  infer_mode: false
+  use_space_char: true
+  distributed: true
+  save_res_path: ./output/rec/predicts_svtrv2.txt
+
+
+Optimizer:
+  name: AdamW
+  beta1: 0.9
+  beta2: 0.999
+  epsilon: 1.e-8
+  weight_decay: 0.05
+  no_weight_decay_name: norm
+  one_dim_param_no_weight_decay: True
+  lr:
+    name: Cosine
+    learning_rate: 0.001 # 8gpus 192bs
+    warmup_epoch: 5
+
+
+Architecture:
+  model_type: rec
+  algorithm: SVTR_HGNet
+  Transform:
+  Backbone:
+    name: SVTRv2
+    use_pos_embed: False
+    dims: [128, 256, 384]
+    depths: [6, 6, 6]
+    num_heads: [4, 8, 12]
+    mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','Global','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
+    local_k: [[5, 5], [5, 5], [-1, -1]]
+    sub_k: [[2, 1], [2, 1], [-1, -1]]
+    last_stage: False
+    use_pool: True
+  Head:
+    name: MultiHead
+    head_list:
+      - CTCHead:
+          Neck:
+            name: svtr
+            dims: 256
+            depth: 2
+            hidden_dims: 256
+            kernel_size: [1, 3]
+            use_guide: True
+          Head:
+            fc_decay: 0.00001
+      - NRTRHead:
+          nrtr_dim: 384
+          max_text_length: *max_text_length
+          num_decoder_layers: 2
+
+Loss:
+  name: MultiLoss
+  loss_config_list:
+    - CTCLoss:
+    - NRTRLoss:
+
+PostProcess:
+  name: CTCLabelDecode
+
+Metric:
+  name: RecMetric
+  main_indicator: acc
+
+
+Train:
+  dataset:
+    name: MultiScaleDataSet
+    ds_width: false
+    data_dir: ./train_data/
+    ext_op_transform_idx: 1
+    label_file_list:
+      - ./train_data/train_list.txt
+    transforms:
+      - DecodeImage:
+          img_mode: BGR
+          channel_first: false
+      - RecAug:
+      - MultiLabelEncode:
+          gtc_encode: NRTRLabelEncode
+      - KeepKeys:
+          keep_keys:
+            - image
+            - label_ctc
+            - label_gtc
+            - length
+            - valid_ratio
+  sampler:
+    name: MultiScaleSampler
+    scales: [[320, 32], [320, 48], [320, 64]]
+    first_bs: &bs 192
+    fix_bs: false
+    divided_factor: [8, 16] # w, h
+    is_training: True
+  loader:
+    shuffle: true
+    batch_size_per_card: *bs
+    drop_last: true
+    num_workers: 8
+Eval:
+  dataset:
+    name: SimpleDataSet
+    data_dir: ./train_data
+    label_file_list:
+      - ./train_data/val_list.txt
+    transforms:
+      - DecodeImage:
+          img_mode: BGR
+          channel_first: false
+      - MultiLabelEncode:
+          gtc_encode: NRTRLabelEncode
+      - RecResizeImg:
+          image_shape: [3, 48, 320]
+      - KeepKeys:
+          keep_keys:
+            - image
+            - label_ctc
+            - label_gtc
+            - length
+            - valid_ratio
+  loader:
+    shuffle: false
+    drop_last: false
+    batch_size_per_card: 128
+    num_workers: 4
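The `mixer` schedule above is the heart of SVTRv2: local convolutional mixing in the early stages, global self-attention in the later ones, with `sub_k: [2, 1]` halving only the height at each downsample. A shape sanity check for this exact configuration, assuming you run it from the repository root with this PR applied (the expected shape is my reading of the code, so treat the comment as expected rather than guaranteed):

```python
import paddle
from ppocr.modeling.backbones.rec_svtrv2 import SVTRv2

net = SVTRv2(
    use_pos_embed=False,
    dims=[128, 256, 384],
    depths=[6, 6, 6],
    num_heads=[4, 8, 12],
    mixer=[["Conv"] * 6, ["Conv"] * 2 + ["Global"] * 4, ["Global"] * 6],
    sub_k=[[2, 1], [2, 1], [-1, -1]],
    last_stage=False,
    use_pool=True,
)
x = paddle.randn([1, 3, 48, 320])  # the eval image_shape from this config
# Patch embed: 48x320 -> 12x80; two [2, 1] subsamples: -> 3x80;
# OutPool averages the height and halves the width: -> 1x40.
print(net(x).shape)  # expected [1, 384, 1, 40]
```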
diff --git a/configs/rec/SVTRv2/rec_svtrv2_gtc_distill.yml b/configs/rec/SVTRv2/rec_svtrv2_gtc_distill.yml
new file mode 100644
index 0000000000..15d781fc22
--- /dev/null
+++ b/configs/rec/SVTRv2/rec_svtrv2_gtc_distill.yml
@@ -0,0 +1,208 @@
+Global:
+  debug: false
+  use_gpu: true
+  epoch_num: 100
+  log_smooth_window: 20
+  print_batch_step: 10
+  save_model_dir: ./output/rec_svtrv2_gtc_distill_lr00002/
+  save_epoch_step: 5
+  eval_batch_step:
+    - 0
+    - 1000
+  cal_metric_during_train: False
+  pretrained_model:
+  checkpoints:
+  save_inference_dir:
+  use_visualdl: false
+  infer_img: doc/imgs_words/ch/word_1.jpg
+  character_dict_path: ppocr/utils/ppocr_keys_v1.txt
+  max_text_length: &max_text_length 25
+  infer_mode: false
+  use_space_char: true
+  distributed: true
+  save_res_path: ./output/rec/predicts_svtrv2_gtc_distill.txt
+Optimizer:
+  name: AdamW
+  beta1: 0.9
+  beta2: 0.99
+  epsilon: 1.e-8
+  weight_decay: 0.05
+  no_weight_decay_name: norm pos_embed patch_embed downsample
+  one_dim_param_no_weight_decay: True
+  lr:
+    name: Cosine
+    learning_rate: 0.0002 # 8gpus 192bs
+    warmup_epoch: 5
+Architecture:
+  model_type: rec
+  name: DistillationModel
+  algorithm: Distillation
+  Models:
+    Teacher:
+      pretrained: ./output/rec_svtrv2_gtc/best_accuracy
+      freeze_params: true
+      return_all_feats: true
+      model_type: rec
+      algorithm: SVTR_LCNet
+      Transform: null
+      Backbone:
+        name: SVTRv2
+        use_pos_embed: False
+        dims: [128, 256, 384]
+        depths: [6, 6, 6]
+        num_heads: [4, 8, 12]
+        mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','Global','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
+        local_k: [[5, 5], [5, 5], [-1, -1]]
+        sub_k: [[2, 1], [2, 1], [-1, -1]]
+        last_stage: False
+        use_pool: True
+      Head:
+        name: MultiHead
+        head_list:
+          - CTCHead:
+              Neck:
+                name: svtr
+                dims: 256
+                depth: 2
+                hidden_dims: 256
+                kernel_size: [1, 3]
+                use_guide: True
+              Head:
+                fc_decay: 0.00001
+          - NRTRHead:
+              nrtr_dim: 384
+              num_decoder_layers: 2
+              max_text_length: *max_text_length
+    Student:
+      pretrained: ./output/rec_repsvtr_gtc/best_accuracy
+      freeze_params: false
+      return_all_feats: true
+      model_type: rec
+      algorithm: SVTR_LCNet
+      Transform: null
+      Backbone:
+        name: RepSVTR
+      Head:
+        name: MultiHead
+        head_list:
+          - CTCHead:
+              Neck:
+                name: svtr
+                dims: 256
+                depth: 2
+                hidden_dims: 256
+                kernel_size: [1, 3]
+                use_guide: True
+              Head:
+                fc_decay: 0.00001
+          - NRTRHead:
+              nrtr_dim: 384
+              num_decoder_layers: 2
+              max_text_length: *max_text_length
+Loss:
+  name: CombinedLoss
+  loss_config_list:
+    - DistillationDKDLoss:
+        weight: 0.1
+        model_name_pairs:
+          - - Student
+            - Teacher
+        key: head_out
+        multi_head: true
+        alpha: 1.0
+        beta: 2.0
+        dis_head: gtc
+        name: dkd
+    - DistillationCTCLoss:
+        weight: 1.0
+        model_name_list:
+          - Student
+        key: head_out
+        multi_head: true
+    - DistillationNRTRLoss:
+        weight: 1.0
+        smoothing: false
+        model_name_list:
+          - Student
+        key: head_out
+        multi_head: true
+    - DistillCTCLogits:
+        weight: 1.0
+        reduction: mean
+        model_name_pairs:
+          - - Student
+            - Teacher
+        key: head_out
+PostProcess:
+  name: DistillationCTCLabelDecode
+  model_name:
+    - Student
+  key: head_out
+  multi_head: true
+Metric:
+  name: DistillationMetric
+  base_metric_name: RecMetric
+  main_indicator: acc
+  key: Student
+
+
+Train:
+  dataset:
+    name: MultiScaleDataSet
+    ds_width: false
+    data_dir: ./train_data/
+    ext_op_transform_idx: 1
+    label_file_list:
+      - ./train_data/train_list.txt
+    transforms:
+      - DecodeImage:
+          img_mode: BGR
+          channel_first: false
+      - RecAug:
+      - MultiLabelEncode:
+          gtc_encode: NRTRLabelEncode
+      - KeepKeys:
+          keep_keys:
+            - image
+            - label_ctc
+            - label_gtc
+            - length
+            - valid_ratio
+  sampler:
+    name: MultiScaleSampler
+    scales: [[320, 32], [320, 48], [320, 64]]
+    first_bs: &bs 192
+    fix_bs: false
+    divided_factor: [8, 16] # w, h
+    is_training: True
+  loader:
+    shuffle: true
+    batch_size_per_card: *bs
+    drop_last: true
+    num_workers: 8
+Eval:
+  dataset:
+    name: SimpleDataSet
+    data_dir: ./train_data
+    label_file_list:
+      - ./train_data/val_list.txt
+    transforms:
+      - DecodeImage:
+          img_mode: BGR
+          channel_first: false
+      - MultiLabelEncode:
+          gtc_encode: NRTRLabelEncode
+      - RecResizeImg:
+          image_shape: [3, 48, 320]
+      - KeepKeys:
+          keep_keys:
+            - image
+            - label_ctc
+            - label_gtc
+            - length
+            - valid_ratio
+  loader:
+    shuffle: false
+    drop_last: false
+    batch_size_per_card: 128
+    num_workers: 4
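The distillation objective above mixes one decoupled KD term on the GTC head (`DistillationDKDLoss`, weight 0.1), the student's own CTC and NRTR losses, and a soft-logits term on the CTC head (`DistillCTCLogits`). The real loss classes live in `ppocr/losses` and differ in detail; the following is only a conceptual sketch of how a response-based KD term of this kind combines with the hard-label losses, with stand-in tensors and a placeholder for the supervised part:

```python
import paddle
import paddle.nn.functional as F


def soft_logits_kd(student_logits, teacher_logits, temperature=1.0):
    # Response-based KD: KL between softened teacher and student distributions.
    log_p_student = F.log_softmax(student_logits / temperature, axis=-1)
    p_teacher = F.softmax(teacher_logits / temperature, axis=-1)
    return F.kl_div(log_p_student, p_teacher, reduction="mean")


student = paddle.randn([4, 40, 6625])  # B x T x num_classes CTC logits (stand-ins)
teacher = paddle.randn([4, 40, 6625])
hard_loss = paddle.to_tensor(1.23)  # placeholder for the student's CTC + NRTR losses
total = hard_loss + 1.0 * soft_logits_kd(student, teacher)  # + 0.1 * DKD on the GTC head
print(float(total))
```

Note that the teacher is loaded frozen (`freeze_params: true`), so only the RepSVTR student receives gradients.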
diff --git a/doc/doc_ch/algorithm_rec_svtr.md b/doc/doc_ch/algorithm_rec_svtr.md
index 42a1a9a415..34881c1146 100644
--- a/doc/doc_ch/algorithm_rec_svtr.md
+++ b/doc/doc_ch/algorithm_rec_svtr.md
@@ -18,7 +18,7 @@ Paper information:
 > [SVTR: Scene Text Recognition with a Single Visual Model](https://arxiv.org/abs/2205.00159)
-> Yongkun Du and Zhineng Chen and Caiyan Jia Xiaoting Yin and Tianlun Zheng and Chenxia Li and Yuning Du and Yu-Gang Jiang
+> Yongkun Du and Zhineng Chen and Caiyan Jia and Xiaoting Yin and Tianlun Zheng and Chenxia Li and Yuning Du and Yu-Gang Jiang
 > IJCAI, 2022

 Scene text recognition aims to transcribe text in natural images into digital character sequences, conveying high-level semantics essential to scene understanding. The task is challenging due to variations in text deformation, fonts, occlusion, cluttered backgrounds, and so on. Previous methods have made considerable efforts to improve recognition accuracy. Beyond accuracy, however, practical demands also require a text recognizer to take factors such as inference speed into account.

@@ -102,7 +102,7 @@ python3 tools/infer_rec.py -c ./rec_svtr_tiny_none_ctc_en_train/rec_svtr_tiny_6l
 ### 4.1 Python Inference

-First, convert the best model from training into an inference model. Taking the model trained on an English dataset based on `SVTR-T` as an example ([model and config download link](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_tiny_none_ctc_en_train.tar)), run the following command to convert it:
+First, convert the best model from training into an inference model. Taking the `SVTR-T` model trained on an English dataset as an example ([model and config download link](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_tiny_none_ctc_en_train.tar)), run the following command to convert it:

 ```shell
 # Note: set pretrained_model to a local path.
diff --git a/doc/doc_ch/algorithm_rec_svtrv2.md b/doc/doc_ch/algorithm_rec_svtrv2.md
new file mode 100644
index 0000000000..a508b4f02c
--- /dev/null
+++ b/doc/doc_ch/algorithm_rec_svtrv2.md
@@ -0,0 +1,143 @@
+# Scene Text Recognition Algorithm - SVTRv2
+
+- [1. Algorithm Introduction](#1)
+- [2. Environment Setup](#2)
+- [3. Model Training, Evaluation, and Prediction](#3)
+  - [3.1 Training](#3-1)
+  - [3.2 Evaluation](#3-2)
+  - [3.3 Prediction](#3-3)
+- [4. Inference and Deployment](#4)
+  - [4.1 Python Inference](#4-1)
+  - [4.2 C++ Inference](#4-2)
+  - [4.3 Serving Deployment](#4-3)
+  - [4.4 More Deployment Options](#4-4)
+- [5. FAQ](#5)
+
+
+## 1. Algorithm Introduction
+
+### SVTRv2 Algorithm Introduction
+
+The first-place algorithm on the leaderboard of the [PaddleOCR Algorithm Model Challenge - Track 1: OCR End-to-End Recognition](https://aistudio.baidu.com/competition/detail/1131/0/introduction). Its main ideas: 1. the backbone of both the detection and recognition models is upgraded to RepSVTR; 2. the recognition teacher model is upgraded to SVTRv2, which can recognize long text.
+
+
+## 2. Environment Setup
+
+Please refer to [Environment Preparation](./environment.md) to set up the PaddleOCR runtime environment, and to [Project Clone](./clone.md) to clone the project code.
+
+
+## 3. Model Training, Evaluation, and Prediction
+
+### 3.1 Model Training
+
+Training commands:
+```shell
+# Single-GPU training (long training time, not recommended)
+python3 tools/train.py -c configs/rec/SVTRv2/rec_repsvtr_gtc.yml
+
+# Multi-GPU training; specify the GPU ids with the --gpus flag
+# Rec student model
+python -m paddle.distributed.launch --gpus '0,1,2,3,4,5,6,7' tools/train.py -c configs/rec/SVTRv2/rec_repsvtr_gtc.yml
+# Rec teacher model
+python -m paddle.distributed.launch --gpus '0,1,2,3,4,5,6,7' tools/train.py -c configs/rec/SVTRv2/rec_svtrv2_gtc.yml
+# Rec distillation training
+python -m paddle.distributed.launch --gpus '0,1,2,3,4,5,6,7' tools/train.py -c configs/rec/SVTRv2/rec_svtrv2_gtc_distill.yml
+```
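+
+Note that the distillation config initializes both branches from the models trained in the two previous commands, so run those first (or point the paths at your own checkpoints). The relevant excerpt from `rec_svtrv2_gtc_distill.yml`:
+
+```yaml
+Models:
+  Teacher:
+    pretrained: ./output/rec_svtrv2_gtc/best_accuracy
+    freeze_params: true
+  Student:
+    pretrained: ./output/rec_repsvtr_gtc/best_accuracy
+    freeze_params: false
+```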
+
+### 3.2 Evaluation
+
+```shell
+# Note: set pretrained_model to a local path.
+python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/SVTRv2/rec_repsvtr_gtc.yml -o Global.pretrained_model=output/rec_repsvtr_gtc/best_accuracy
+```
+
+### 3.3 Prediction
+
+Use the following command to run prediction on a single image:
+```shell
+# Note: set pretrained_model to a local path.
+python3 tools/infer_rec.py -c configs/rec/SVTRv2/rec_repsvtr_gtc.yml -o Global.pretrained_model=output/rec_repsvtr_gtc/best_accuracy Global.infer_img='./doc/imgs_words_en/word_10.png'
+# To predict all images in a folder, set infer_img to a directory, e.g. Global.infer_img='./doc/imgs_words_en/'.
+```
+
+
+## 4. Inference and Deployment
+
+### 4.1 Python Inference
+First, convert the best model from training into an inference model. Taking RepSVTR as an example, run the following command:
+
+```shell
+# Note: set pretrained_model to a local path.
+python3 tools/export_model.py -c configs/rec/SVTRv2/rec_repsvtr_gtc.yml -o Global.pretrained_model=output/rec_repsvtr_gtc/best_accuracy Global.save_inference_dir=./inference/rec_repsvtr_infer
+```
+
+**Note:**
+- If you trained the model on your own dataset and changed the dictionary file, make sure that `character_dict_path` in the config file points to the correct dictionary.
+
+After a successful conversion, the directory contains three files:
+```
+./inference/rec_repsvtr_infer/
+    ├── inference.pdiparams         # parameter file of the recognition inference model
+    ├── inference.pdiparams.info    # parameter info of the recognition inference model, can be ignored
+    └── inference.pdmodel           # program file of the recognition inference model
+```
+
+Run the following command for model inference:
+
+```shell
+python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words_en/word_10.png' --rec_model_dir='./inference/rec_repsvtr_infer/'
+# To predict all images in a folder, set image_dir to a directory, e.g. --image_dir='./doc/imgs_words_en/'.
+```
+![](../imgs_words_en/word_10.png)
+
+After running the command, the prediction result for the image above (the recognized text and its score) is printed to the screen, for example:
+```shell
+Predicts of ./doc/imgs_words_en/word_10.png:('pain', 0.9999998807907104)
+```
+
+**Note**:
+- If you changed the input resolution during training, set the `rec_image_shape` parameter to the recognition image shape you need.
+- Inference requires the `rec_char_dict_path` parameter to specify the dictionary; if you modified the dictionary, point this parameter at your dictionary file.
+- If you modified the preprocessing, replace the SVTR preprocessing in `tools/infer/predict_rec.py` with your own preprocessing method.
+
+### 4.2 C++ Inference
+
+Not supported yet, since the C++ pre- and post-processing do not yet support SVTRv2.
+
+### 4.3 Serving Deployment
+
+Not supported yet.
+
+### 4.4 More Deployment Options
+
+Not supported yet.
+
+## 5. FAQ
+
+## Citation
+
+```bibtex
+@inproceedings{Du2022SVTR,
+  title     = {SVTR: Scene Text Recognition with a Single Visual Model},
+  author    = {Du, Yongkun and Chen, Zhineng and Jia, Caiyan and Yin, Xiaoting and Zheng, Tianlun and Li, Chenxia and Du, Yuning and Jiang, Yu-Gang},
+  booktitle = {IJCAI},
+  year      = {2022},
+  url       = {https://arxiv.org/abs/2205.00159}
+}
+```
diff --git a/ppocr/losses/rec_multi_loss.py b/ppocr/losses/rec_multi_loss.py
index c19febe535..74be385651 100644
--- a/ppocr/losses/rec_multi_loss.py
+++ b/ppocr/losses/rec_multi_loss.py
@@ -55,7 +55,7 @@ def forward(self, predicts, batch):
                 )
             elif name == "NRTRLoss":
                 loss = (
-                    loss_func(predicts["nrtr"], batch[:1] + batch[2:])["loss"]
+                    loss_func(predicts["gtc"], batch[:1] + batch[2:])["loss"]
                     * self.weight_2
                 )
             else:
diff --git a/ppocr/modeling/backbones/__init__.py b/ppocr/modeling/backbones/__init__.py
index ce80afd109..0b64992b20 100755
--- a/ppocr/modeling/backbones/__init__.py
+++ b/ppocr/modeling/backbones/__init__.py
@@ -25,6 +25,7 @@ def build_backbone(config, model_type):
         from .rec_lcnetv3 import PPLCNetV3
         from .rec_hgnet import PPHGNet_small
         from .rec_vit import ViT
+        from .rec_repvit import RepSVTR_det

         support_dict = [
             "MobileNetV3",
@@ -34,6 +35,7 @@ def build_backbone(config, model_type):
             "PPLCNet",
             "PPLCNetV3",
             "PPHGNet_small",
+            "RepSVTR_det",
         ]
     if model_type == "table":
         from .table_master_resnet import TableResNetExtra
@@ -59,6 +61,8 @@ def build_backbone(config, model_type):
        from .rec_lcnetv3 import PPLCNetV3
        from .rec_hgnet import PPHGNet_small
        from .rec_vit_parseq import ViTParseQ
+       from .rec_repvit import RepSVTR
+       from .rec_svtrv2 import SVTRv2

        support_dict = [
            "MobileNetV1Enhance",
@@ -81,6 +85,8 @@ def build_backbone(config, model_type):
            "PPHGNet_small",
            "ViTParseQ",
            "ViT",
+           "RepSVTR",
+           "SVTRv2",
        ]
    elif model_type == "e2e":
        from .e2e_resnet_vd_pg import ResNet
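These registrations are how the YAML `Backbone.name` strings reach the new classes: `build_backbone` pops `name` from the backbone config, checks it against `support_dict`, and instantiates the class with the remaining keys. A quick smoke test, assuming you run it from the repository root with this PR applied ("RepSVTR" is registered for `model_type` "rec", "RepSVTR_det" for "det"):

```python
from ppocr.modeling.backbones import build_backbone

# The dict mirrors a config's Backbone section; "name" is popped internally.
backbone = build_backbone({"name": "RepSVTR"}, model_type="rec")
print(backbone.out_channels)  # 384, the last-stage width of the rec variant
```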
diff --git a/ppocr/modeling/backbones/rec_repvit.py b/ppocr/modeling/backbones/rec_repvit.py
new file mode 100644
index 0000000000..e983569c44
--- /dev/null
+++ b/ppocr/modeling/backbones/rec_repvit.py
@@ -0,0 +1,363 @@
+# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This code is adapted from:
+https://github.com/THU-MIG/RepViT
+"""
+
+import paddle.nn as nn
+import paddle
+from paddle.nn.initializer import TruncatedNormal, Constant, Normal
+
+trunc_normal_ = TruncatedNormal(std=0.02)
+normal_ = Normal
+zeros_ = Constant(value=0.0)
+ones_ = Constant(value=1.0)
+
+
+def _make_divisible(v, divisor, min_value=None):
+    """
+    This function is taken from the original tf repo.
+    It ensures that all layers have a channel number that is divisible by 8.
+    It can be seen here:
+    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
+    :param v:
+    :param divisor:
+    :param min_value:
+    :return:
+    """
+    if min_value is None:
+        min_value = divisor
+    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
+    # Make sure that rounding down does not go down by more than 10%.
+    if new_v < 0.9 * v:
+        new_v += divisor
+    return new_v
+
+
+# from timm.models.layers import SqueezeExcite
+
+
+def make_divisible(v, divisor=8, min_value=None, round_limit=0.9):
+    min_value = min_value or divisor
+    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
+    # Make sure that rounding down does not go down by more than 10%.
+    if new_v < round_limit * v:
+        new_v += divisor
+    return new_v
+
+
+class SEModule(nn.Layer):
+    """SE Module as defined in original SE-Nets, with a few additions.
+    Additions include:
+        * divisor can be specified to keep channels % div == 0 (default: 8)
+        * reduction channels can be specified directly by arg (if rd_channels is set)
+        * reduction channels can be specified by float rd_ratio (default: 1/16)
+        * global max pooling can be added to the squeeze aggregation
+        * customizable activation, normalization, and gate layer
+    """
+
+    def __init__(
+        self,
+        channels,
+        rd_ratio=1.0 / 16,
+        rd_channels=None,
+        rd_divisor=8,
+        act_layer=nn.ReLU,
+    ):
+        super(SEModule, self).__init__()
+        if not rd_channels:
+            rd_channels = make_divisible(
+                channels * rd_ratio, rd_divisor, round_limit=0.0
+            )
+        self.fc1 = nn.Conv2D(channels, rd_channels, kernel_size=1, bias_attr=True)
+        self.act = act_layer()
+        self.fc2 = nn.Conv2D(rd_channels, channels, kernel_size=1, bias_attr=True)
+
+    def forward(self, x):
+        x_se = x.mean((2, 3), keepdim=True)
+        x_se = self.fc1(x_se)
+        x_se = self.act(x_se)
+        x_se = self.fc2(x_se)
+        return x * nn.functional.sigmoid(x_se)
+
+
+class Conv2D_BN(nn.Sequential):
+    def __init__(
+        self,
+        a,
+        b,
+        ks=1,
+        stride=1,
+        pad=0,
+        dilation=1,
+        groups=1,
+        bn_weight_init=1,
+        resolution=-10000,
+    ):
+        super().__init__()
+        self.add_sublayer(
+            "c", nn.Conv2D(a, b, ks, stride, pad, dilation, groups, bias_attr=False)
+        )
+        self.add_sublayer("bn", nn.BatchNorm2D(b))
+        if bn_weight_init == 1:
+            ones_(self.bn.weight)
+        else:
+            zeros_(self.bn.weight)
+        zeros_(self.bn.bias)
+
+    @paddle.no_grad()
+    def fuse(self):
+        # Fold the BatchNorm statistics into the preceding conv so that the
+        # pair becomes a single plain Conv2D at inference time.
+        c, bn = self.c, self.bn
+        w = bn.weight / (bn._variance + bn._epsilon) ** 0.5
+        w = c.weight * w[:, None, None, None]
+        b = bn.bias - bn._mean * bn.weight / (bn._variance + bn._epsilon) ** 0.5
+        m = nn.Conv2D(
+            w.shape[1] * self.c._groups,
+            w.shape[0],
+            w.shape[2:],
+            stride=self.c._stride,
+            padding=self.c._padding,
+            dilation=self.c._dilation,
+            groups=self.c._groups,
+        )
+        m.weight.set_value(w)
+        m.bias.set_value(b)
+        return m
+class Residual(nn.Layer):
+    def __init__(self, m, drop=0.0):
+        super().__init__()
+        self.m = m
+        self.drop = drop
+
+    def forward(self, x):
+        if self.training and self.drop > 0:
+            # Stochastic depth: keep each sample's residual branch with
+            # probability (1 - drop) and rescale to preserve the expectation.
+            mask = (
+                (paddle.rand([x.shape[0], 1, 1, 1]) >= self.drop).astype(x.dtype)
+                / (1 - self.drop)
+            ).detach()
+            return x + self.m(x) * mask
+        else:
+            return x + self.m(x)
+
+    @paddle.no_grad()
+    def fuse(self):
+        if isinstance(self.m, Conv2D_BN):
+            m = self.m.fuse()
+            assert m._groups == m.in_channels
+            identity = paddle.ones([m.weight.shape[0], m.weight.shape[1], 1, 1])
+            identity = nn.functional.pad(identity, [1, 1, 1, 1])
+            m.weight += identity
+            return m
+        elif isinstance(self.m, nn.Conv2D):
+            m = self.m
+            assert m._groups != m.in_channels
+            identity = paddle.ones([m.weight.shape[0], m.weight.shape[1], 1, 1])
+            identity = nn.functional.pad(identity, [1, 1, 1, 1])
+            m.weight += identity
+            return m
+        else:
+            return self
+
+
+class RepVGGDW(nn.Layer):
+    def __init__(self, ed) -> None:
+        super().__init__()
+        self.conv = Conv2D_BN(ed, ed, 3, 1, 1, groups=ed)
+        self.conv1 = nn.Conv2D(ed, ed, 1, 1, 0, groups=ed)
+        self.dim = ed
+        self.bn = nn.BatchNorm2D(ed)
+
+    def forward(self, x):
+        return self.bn((self.conv(x) + self.conv1(x)) + x)
+
+    @paddle.no_grad()
+    def fuse(self):
+        # Merge the 3x3, 1x1, and identity branches, then fold the outer BN.
+        conv = self.conv.fuse()
+        conv1 = self.conv1
+
+        conv_w = conv.weight
+        conv_b = conv.bias
+        conv1_w = conv1.weight
+        conv1_b = conv1.bias
+
+        conv1_w = nn.functional.pad(conv1_w, [1, 1, 1, 1])
+
+        identity = nn.functional.pad(
+            paddle.ones([conv1_w.shape[0], conv1_w.shape[1], 1, 1]), [1, 1, 1, 1]
+        )
+
+        final_conv_w = conv_w + conv1_w + identity
+        final_conv_b = conv_b + conv1_b
+
+        conv.weight.set_value(final_conv_w)
+        conv.bias.set_value(final_conv_b)
+
+        bn = self.bn
+        w = bn.weight / (bn._variance + bn._epsilon) ** 0.5
+        w = conv.weight * w[:, None, None, None]
+        b = (
+            bn.bias
+            + (conv.bias - bn._mean) * bn.weight / (bn._variance + bn._epsilon) ** 0.5
+        )
+        conv.weight.set_value(w)
+        conv.bias.set_value(b)
+        return conv
+
+
+class RepViTBlock(nn.Layer):
+    def __init__(self, inp, hidden_dim, oup, kernel_size, stride, use_se, use_hs):
+        super(RepViTBlock, self).__init__()
+
+        self.identity = stride == 1 and inp == oup
+        assert hidden_dim == 2 * inp
+
+        if stride != 1:
+            self.token_mixer = nn.Sequential(
+                Conv2D_BN(
+                    inp, inp, kernel_size, stride, (kernel_size - 1) // 2, groups=inp
+                ),
+                SEModule(inp, 0.25) if use_se else nn.Identity(),
+                Conv2D_BN(inp, oup, ks=1, stride=1, pad=0),
+            )
+            self.channel_mixer = Residual(
+                nn.Sequential(
+                    # pw
+                    Conv2D_BN(oup, 2 * oup, 1, 1, 0),
+                    # both branches are GELU, matching the reference RepViT code
+                    nn.GELU() if use_hs else nn.GELU(),
+                    # pw-linear
+                    Conv2D_BN(2 * oup, oup, 1, 1, 0, bn_weight_init=0),
+                )
+            )
+        else:
+            assert self.identity
+            self.token_mixer = nn.Sequential(
+                RepVGGDW(inp),
+                SEModule(inp, 0.25) if use_se else nn.Identity(),
+            )
+            self.channel_mixer = Residual(
+                nn.Sequential(
+                    # pw
+                    Conv2D_BN(inp, hidden_dim, 1, 1, 0),
+                    # both branches are GELU, matching the reference RepViT code
+                    nn.GELU() if use_hs else nn.GELU(),
+                    # pw-linear
+                    Conv2D_BN(hidden_dim, oup, 1, 1, 0, bn_weight_init=0),
+                )
+            )
+
+    def forward(self, x):
+        return self.channel_mixer(self.token_mixer(x))
+class RepViT(nn.Layer):
+    def __init__(self, cfgs, in_channels=3, out_indices=None):
+        super(RepViT, self).__init__()
+        # setting of inverted residual blocks
+        self.cfgs = cfgs
+
+        # building first layer
+        input_channel = self.cfgs[0][2]
+        patch_embed = nn.Sequential(
+            Conv2D_BN(in_channels, input_channel // 2, 3, 2, 1),
+            nn.GELU(),
+            Conv2D_BN(input_channel // 2, input_channel, 3, 2, 1),
+        )
+        layers = [patch_embed]
+        # building inverted residual blocks
+        block = RepViTBlock
+        for k, t, c, use_se, use_hs, s in self.cfgs:
+            output_channel = _make_divisible(c, 8)
+            exp_size = _make_divisible(input_channel * t, 8)
+            layers.append(
+                block(input_channel, exp_size, output_channel, k, s, use_se, use_hs)
+            )
+            input_channel = output_channel
+        self.features = nn.LayerList(layers)
+        self.out_indices = out_indices
+        if out_indices is not None:
+            self.out_channels = [self.cfgs[ids - 1][2] for ids in out_indices]
+        else:
+            self.out_channels = self.cfgs[-1][2]
+
+    def forward(self, x):
+        if self.out_indices is not None:
+            return self.forward_det(x)
+        return self.forward_rec(x)
+
+    def forward_det(self, x):
+        outs = []
+        for i, f in enumerate(self.features):
+            x = f(x)
+            if i in self.out_indices:
+                outs.append(x)
+        return outs
+
+    def forward_rec(self, x):
+        for f in self.features:
+            x = f(x)
+        h = x.shape[2]
+        x = nn.functional.avg_pool2d(x, [h, 2])
+        return x
+
+
+def RepSVTR(in_channels=3):
+    """
+    Constructs the RepSVTR recognition backbone, a RepViT variant whose cfgs
+    follow the MobileNetV3-Large table format.
+    """
+    # k, t, c, SE, HS, s
+    cfgs = [
+        [3, 2, 96, 1, 0, 1],
+        [3, 2, 96, 0, 0, 1],
+        [3, 2, 96, 0, 0, 1],
+        [3, 2, 192, 0, 1, (2, 1)],
+        [3, 2, 192, 1, 1, 1],
+        [3, 2, 192, 0, 1, 1],
+        [3, 2, 192, 1, 1, 1],
+        [3, 2, 192, 0, 1, 1],
+        [3, 2, 192, 1, 1, 1],
+        [3, 2, 192, 0, 1, 1],
+        [3, 2, 384, 0, 1, (2, 1)],
+        [3, 2, 384, 1, 1, 1],
+        [3, 2, 384, 0, 1, 1],
+    ]
+    return RepViT(cfgs, in_channels=in_channels)
+
+
+def RepSVTR_det(in_channels=3, out_indices=[2, 5, 10, 13]):
+    """
+    Constructs the RepSVTR detection backbone, which returns a multi-scale
+    feature pyramid at the stages selected by out_indices.
+    """
+    # k, t, c, SE, HS, s
+    cfgs = [
+        [3, 2, 48, 1, 0, 1],
+        [3, 2, 48, 0, 0, 1],
+        [3, 2, 96, 0, 0, 2],
+        [3, 2, 96, 1, 0, 1],
+        [3, 2, 96, 0, 0, 1],
+        [3, 2, 192, 0, 1, 2],
+        [3, 2, 192, 1, 1, 1],
+        [3, 2, 192, 0, 1, 1],
+        [3, 2, 192, 1, 1, 1],
+        [3, 2, 192, 0, 1, 1],
+        [3, 2, 384, 0, 1, 2],
+        [3, 2, 384, 1, 1, 1],
+        [3, 2, 384, 0, 1, 1],
+    ]
+    return RepViT(cfgs, in_channels=in_channels, out_indices=out_indices)
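The "Rep" in RepSVTR refers to structural reparameterization: the multi-branch blocks above train as conv+BN pairs and fold into single plain convolutions for deployment via the `fuse()` methods. A small numeric sanity check of the BN folding in `Conv2D_BN.fuse`, assuming you run it from the repository root with this PR applied:

```python
import paddle
from ppocr.modeling.backbones.rec_repvit import Conv2D_BN

layer = Conv2D_BN(8, 16, ks=3, stride=1, pad=1)
layer.eval()  # fusing uses the BN running statistics, i.e. inference mode
x = paddle.randn([2, 8, 32, 32])
fused = layer.fuse()  # a single plain Conv2D with folded weights and bias
print(float((layer(x) - fused(x)).abs().max()))  # should be ~1e-6
```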
diff --git a/ppocr/modeling/backbones/rec_svtrv2.py b/ppocr/modeling/backbones/rec_svtrv2.py
new file mode 100644
index 0000000000..31ce55a65a
--- /dev/null
+++ b/ppocr/modeling/backbones/rec_svtrv2.py
@@ -0,0 +1,575 @@
+# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle import ParamAttr
+from paddle.nn.initializer import KaimingNormal
+import numpy as np
+import paddle
+import paddle.nn as nn
+from paddle.nn.initializer import TruncatedNormal, Constant, Normal
+
+trunc_normal_ = TruncatedNormal(std=0.02)
+normal_ = Normal
+zeros_ = Constant(value=0.0)
+ones_ = Constant(value=1.0)
+
+
+def drop_path(x, drop_prob=0.0, training=False):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+    The original name is misleading, as 'Drop Connect' is a different form of dropout in a separate paper.
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956
+    """
+    if drop_prob == 0.0 or not training:
+        return x
+    keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype)
+    shape = (paddle.shape(x)[0],) + (1,) * (x.ndim - 1)
+    random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype)
+    random_tensor = paddle.floor(random_tensor)  # binarize
+    output = x.divide(keep_prob) * random_tensor
+    return output
+
+
+class DropPath(nn.Layer):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+
+    def __init__(self, drop_prob=None):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+
+    def forward(self, x):
+        return drop_path(x, self.drop_prob, self.training)
+
+
+class Identity(nn.Layer):
+    def __init__(self):
+        super(Identity, self).__init__()
+
+    def forward(self, input):
+        return input
+
+
+class Mlp(nn.Layer):
+    def __init__(
+        self,
+        in_features,
+        hidden_features=None,
+        out_features=None,
+        act_layer=nn.GELU,
+        drop=0.0,
+    ):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+
+
+class ConvBNLayer(nn.Layer):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size=3,
+        stride=1,
+        padding=0,
+        bias_attr=False,
+        groups=1,
+        act=nn.GELU,
+    ):
+        super().__init__()
+        self.conv = nn.Conv2D(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            groups=groups,
+            weight_attr=paddle.ParamAttr(initializer=nn.initializer.KaimingUniform()),
+            bias_attr=bias_attr,
+        )
+        self.norm = nn.BatchNorm2D(out_channels)
+        self.act = act()
+
+    def forward(self, inputs):
+        out = self.conv(inputs)
+        out = self.norm(out)
+        out = self.act(out)
+        return out
+
+
+class Attention(nn.Layer):
+    def __init__(
+        self,
+        dim,
+        num_heads=8,
+        qkv_bias=False,
+        qk_scale=None,
+        attn_drop=0.0,
+        proj_drop=0.0,
+    ):
+        super().__init__()
+        self.num_heads = num_heads
+        self.dim = dim
+        self.head_dim = dim // num_heads
+        self.scale = qk_scale or self.head_dim**-0.5
+
+        self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x):
+        qkv = (
+            self.qkv(x)
+            .reshape((0, -1, 3, self.num_heads, self.head_dim))
+            .transpose((2, 0, 3, 1, 4))
+        )
+        q, k, v = qkv[0], qkv[1], qkv[2]
+
+        attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale
+        attn = nn.functional.softmax(attn, axis=-1)
+        attn = self.attn_drop(attn)
+        x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((0, -1, self.dim))
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class Block(nn.Layer):
+    def __init__(
+        self,
+        dim,
+        num_heads,
+        mlp_ratio=4.0,
+        qkv_bias=False,
+        qk_scale=None,
+        drop=0.0,
+        attn_drop=0.0,
+        drop_path=0.0,
+        act_layer=nn.GELU,
+        norm_layer=nn.LayerNorm,
+        epsilon=1e-6,
+    ):
+        super().__init__()
+        self.norm1 = norm_layer(dim, epsilon=epsilon)
+        self.mixer = Attention(
+            dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+        )
+        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity()
+        self.norm2 = norm_layer(dim, epsilon=epsilon)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp_ratio = mlp_ratio
+        self.mlp = Mlp(
+            in_features=dim,
+            hidden_features=mlp_hidden_dim,
+            act_layer=act_layer,
+            drop=drop,
+        )
+
+    def forward(self, x):
+        x = self.norm1(x + self.drop_path(self.mixer(x)))
+        x = self.norm2(x + self.drop_path(self.mlp(x)))
+        return x
+class ConvBlock(nn.Layer):
+    def __init__(
+        self,
+        dim,
+        num_heads,
+        mlp_ratio=4.0,
+        drop=0.0,
+        drop_path=0.0,
+        act_layer=nn.GELU,
+        norm_layer=nn.LayerNorm,
+        epsilon=1e-6,
+    ):
+        super().__init__()
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.norm1 = norm_layer(dim, epsilon=epsilon)
+        self.mixer = nn.Conv2D(
+            dim,
+            dim,
+            5,
+            1,
+            2,
+            groups=num_heads,
+            weight_attr=ParamAttr(initializer=KaimingNormal()),
+        )
+        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity()
+        self.norm2 = norm_layer(dim, epsilon=epsilon)
+        self.mlp = Mlp(
+            in_features=dim,
+            hidden_features=mlp_hidden_dim,
+            act_layer=act_layer,
+            drop=drop,
+        )
+
+    def forward(self, x):
+        C, H, W = x.shape[1:]
+        x = x + self.drop_path(self.mixer(x))
+        x = self.norm1(x.flatten(2).transpose([0, 2, 1]))
+        x = self.norm2(x + self.drop_path(self.mlp(x)))
+        x = x.transpose([0, 2, 1]).reshape([0, C, H, W])
+        return x
+
+
+class FlattenTranspose(nn.Layer):
+    def forward(self, x):
+        return x.flatten(2).transpose([0, 2, 1])
+
+
+class SubSample2D(nn.Layer):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        stride=[2, 1],
+    ):
+        super().__init__()
+        self.conv = nn.Conv2D(
+            in_channels,
+            out_channels,
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            weight_attr=ParamAttr(initializer=KaimingNormal()),
+        )
+        self.norm = nn.LayerNorm(out_channels)
+
+    def forward(self, x, sz):
+        x = self.conv(x)
+        C, H, W = x.shape[1:]
+        x = self.norm(x.flatten(2).transpose([0, 2, 1]))
+        x = x.transpose([0, 2, 1]).reshape([0, C, H, W])
+        return x, [H, W]
+
+
+class SubSample1D(nn.Layer):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        stride=[2, 1],
+    ):
+        super().__init__()
+        self.conv = nn.Conv2D(
+            in_channels,
+            out_channels,
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            weight_attr=ParamAttr(initializer=KaimingNormal()),
+        )
+        self.norm = nn.LayerNorm(out_channels)
+
+    def forward(self, x, sz):
+        C = x.shape[-1]
+        x = x.transpose([0, 2, 1]).reshape([0, C, sz[0], sz[1]])
+        x = self.conv(x)
+        C, H, W = x.shape[1:]
+        x = self.norm(x.flatten(2).transpose([0, 2, 1]))
+        return x, [H, W]
+
+
+class IdentitySize(nn.Layer):
+    def forward(self, x, sz):
+        return x, sz
+
+
+class SVTRStage(nn.Layer):
+    def __init__(
+        self,
+        dim=64,
+        out_dim=256,
+        depth=3,
+        mixer=["Local"] * 3,
+        sub_k=[2, 1],
+        num_heads=2,
+        mlp_ratio=4,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.0,
+        attn_drop_rate=0.0,
+        drop_path=[0.1] * 3,
+        norm_layer=nn.LayerNorm,
+        act=nn.GELU,
+        eps=1e-6,
+        downsample=None,
+        **kwargs
+    ):
+        super().__init__()
+        self.dim = dim
+
+        conv_block_num = sum([1 if mix == "Conv" else 0 for mix in mixer])
+        blocks = []
+        for i in range(depth):
+            if mixer[i] == "Conv":
+                blocks.append(
+                    ConvBlock(
+                        dim=dim,
+                        num_heads=num_heads,
+                        mlp_ratio=mlp_ratio,
+                        drop=drop_rate,
+                        act_layer=act,
+                        drop_path=drop_path[i],
+                        norm_layer=norm_layer,
+                        epsilon=eps,
+                    )
+                )
+            else:
+                blocks.append(
+                    Block(
+                        dim=dim,
+                        num_heads=num_heads,
+                        mlp_ratio=mlp_ratio,
+                        qkv_bias=qkv_bias,
+                        qk_scale=qk_scale,
+                        drop=drop_rate,
+                        act_layer=act,
+                        attn_drop=attn_drop_rate,
+                        drop_path=drop_path[i],
+                        norm_layer=norm_layer,
+                        epsilon=eps,
+                    )
+                )
+            if i == conv_block_num - 1 and mixer[-1] != "Conv":
+                blocks.append(FlattenTranspose())
+        self.blocks = nn.Sequential(*blocks)
+        if downsample:
+            if mixer[-1] == "Conv":
+                self.downsample = SubSample2D(dim, out_dim, stride=sub_k)
+            elif mixer[-1] == "Global":
+                self.downsample = SubSample1D(dim, out_dim, stride=sub_k)
+        else:
+            self.downsample = IdentitySize()
+
+    def forward(self, x, sz):
+        x = self.blocks(x)
+        x, sz = self.downsample(x, sz)
+        return x, sz
+class ADDPosEmbed(nn.Layer):
+    def __init__(self, feat_max_size=[8, 32], embed_dim=768):
+        super().__init__()
+        pos_embed = paddle.zeros(
+            [1, feat_max_size[0] * feat_max_size[1], embed_dim], dtype=paddle.float32
+        )
+        trunc_normal_(pos_embed)
+        pos_embed = pos_embed.transpose([0, 2, 1]).reshape(
+            [1, embed_dim, feat_max_size[0], feat_max_size[1]]
+        )
+        self.pos_embed = self.create_parameter(
+            [1, embed_dim, feat_max_size[0], feat_max_size[1]]
+        )
+        self.add_parameter("pos_embed", self.pos_embed)
+        self.pos_embed.set_value(pos_embed)
+
+    def forward(self, x):
+        sz = x.shape[2:]
+        x = x + self.pos_embed[:, :, : sz[0], : sz[1]]
+        return x
+
+
+class POPatchEmbed(nn.Layer):
+    """Image to Patch Embedding"""
+
+    def __init__(
+        self,
+        in_channels=3,
+        feat_max_size=[8, 32],
+        embed_dim=768,
+        use_pos_embed=False,
+        flatten=False,
+    ):
+        super().__init__()
+        patch_embed = [
+            ConvBNLayer(
+                in_channels=in_channels,
+                out_channels=embed_dim // 2,
+                kernel_size=3,
+                stride=2,
+                padding=1,
+                act=nn.GELU,
+                bias_attr=None,
+            ),
+            ConvBNLayer(
+                in_channels=embed_dim // 2,
+                out_channels=embed_dim,
+                kernel_size=3,
+                stride=2,
+                padding=1,
+                act=nn.GELU,
+                bias_attr=None,
+            ),
+        ]
+        if use_pos_embed:
+            patch_embed.append(ADDPosEmbed(feat_max_size, embed_dim))
+        if flatten:
+            patch_embed.append(FlattenTranspose())
+        self.patch_embed = nn.Sequential(*patch_embed)
+
+    def forward(self, x):
+        sz = x.shape[2:]
+        x = self.patch_embed(x)
+        return x, [sz[0] // 4, sz[1] // 4]
+
+
+class LastStage(nn.Layer):
+    def __init__(self, in_channels, out_channels, last_drop, out_char_num):
+        super().__init__()
+        self.last_conv = nn.Linear(in_channels, out_channels, bias_attr=False)
+        self.hardswish = nn.Hardswish()
+        self.dropout = nn.Dropout(p=last_drop, mode="downscale_in_infer")
+
+    def forward(self, x, sz):
+        x = x.reshape([0, sz[0], sz[1], x.shape[-1]])
+        x = x.mean(1)
+        x = self.last_conv(x)
+        x = self.hardswish(x)
+        x = self.dropout(x)
+        return x, [1, sz[1]]
+
+
+class OutPool(nn.Layer):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x, sz):
+        C = x.shape[-1]
+        x = x.transpose([0, 2, 1]).reshape([0, C, sz[0], sz[1]])
+        x = nn.functional.avg_pool2d(x, [sz[0], 2])
+        return x, [1, sz[1] // 2]
+
+
+class Feat2D(nn.Layer):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x, sz):
+        C = x.shape[-1]
+        x = x.transpose([0, 2, 1]).reshape([0, C, sz[0], sz[1]])
+        return x, sz
+
+
+class SVTRv2(nn.Layer):
+    def __init__(
+        self,
+        max_sz=[32, 128],
+        in_channels=3,
+        out_channels=192,
+        out_char_num=25,
+        depths=[3, 6, 3],
+        dims=[64, 128, 256],
+        mixer=[["Conv"] * 3, ["Conv"] * 3 + ["Global"] * 3, ["Global"] * 3],
+        use_pos_embed=False,
+        sub_k=[[1, 1], [2, 1], [1, 1]],
+        num_heads=[2, 4, 8],
+        mlp_ratio=4,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.0,
+        last_drop=0.1,
+        attn_drop_rate=0.0,
+        drop_path_rate=0.1,
+        norm_layer=nn.LayerNorm,
+        act=nn.GELU,
+        last_stage=False,
+        eps=1e-6,
+        use_pool=False,
+        feat2d=False,
+        **kwargs
+    ):
+        super().__init__()
+        num_stages = len(depths)
+        self.num_features = dims[-1]
+
+        feat_max_size = [max_sz[0] // 4, max_sz[1] // 4]
+        self.pope = POPatchEmbed(
+            in_channels=in_channels,
+            feat_max_size=feat_max_size,
+            embed_dim=dims[0],
+            use_pos_embed=use_pos_embed,
+            flatten=mixer[0][0] != "Conv",
+        )
+
+        dpr = np.linspace(0, drop_path_rate, sum(depths))  # stochastic depth decay rule
+
+        self.stages = nn.LayerList()
+        for i_stage in range(num_stages):
+            stage = SVTRStage(
+                dim=dims[i_stage],
+                out_dim=dims[i_stage + 1] if i_stage < num_stages - 1 else 0,
+                depth=depths[i_stage],
+                mixer=mixer[i_stage],
+                sub_k=sub_k[i_stage],
+                num_heads=num_heads[i_stage],
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                # use the SVTRStage parameter names so the rates actually
+                # propagate instead of being swallowed by **kwargs
+                drop_rate=drop_rate,
+                attn_drop_rate=attn_drop_rate,
+                drop_path=dpr[sum(depths[:i_stage]) : sum(depths[: i_stage + 1])],
+                norm_layer=norm_layer,
+                act=act,
+                downsample=False if i_stage == num_stages - 1 else True,
+                eps=eps,
+            )
+            self.stages.append(stage)
+
+        self.out_channels = self.num_features
+        self.last_stage = last_stage
+        if last_stage:
+            self.out_channels = out_channels
+            self.stages.append(
+                LastStage(self.num_features, out_channels, last_drop, out_char_num)
+            )
+        if use_pool:
+            self.stages.append(OutPool())
+
+        if feat2d:
+            self.stages.append(Feat2D())
+        self.apply(self._init_weights)
+
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                zeros_(m.bias)
+        elif isinstance(m, nn.LayerNorm):
+            zeros_(m.bias)
+            ones_(m.weight)
+
+    def forward(self, x):
+        x, sz = self.pope(x)
+        for stage in self.stages:
+            x, sz = stage(x, sz)
+        return x
diff --git a/ppocr/modeling/heads/rec_multi_head.py b/ppocr/modeling/heads/rec_multi_head.py
index c7005c108e..50887d7c86 100644
--- a/ppocr/modeling/heads/rec_multi_head.py
+++ b/ppocr/modeling/heads/rec_multi_head.py
@@ -149,5 +149,5 @@ def forward(self, x, targets=None):
             head_out["sar"] = sar_out
         else:
             gtc_out = self.gtc_head(self.before_gtc(x), targets[1:])
-            head_out["nrtr"] = gtc_out
+            head_out["gtc"] = gtc_out
         return head_out
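The two small renames above keep the head output key algorithm-agnostic: MultiHead now stores the guidance branch under "gtc" regardless of whether it is NRTR or SAR, and MultiLoss looks it up under the same key. To close the loop between the new backbone and the detection config at the top of this PR, a final smoke test of the feature pyramid that `RepSVTR_det` hands to RSEFPN; run from the repository root with this PR applied, and treat the shape comments as my reading of the cfgs table rather than a guarantee:

```python
import paddle
from ppocr.modeling.backbones.rec_repvit import RepSVTR_det

net = RepSVTR_det()  # out_indices=[2, 5, 10, 13] taps strides 4/8/16/32
print(net.out_channels)  # [48, 96, 192, 384]
feats = net(paddle.randn([1, 3, 640, 640]))
print([tuple(f.shape) for f in feats])
# expected: [(1, 48, 160, 160), (1, 96, 80, 80), (1, 192, 40, 40), (1, 384, 20, 20)]
```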