From 4aff541b2e3654c8a5077eeaf97d99db065df31b Mon Sep 17 00:00:00 2001 From: UserUnknownFactor <63057995+UserUnknownFactor@users.noreply.github.com> Date: Tue, 20 Jun 2023 10:13:10 +0300 Subject: [PATCH 1/2] Don't break overall processing on a bad image --- paddleocr.py | 18 ++++++++++++------ tools/infer/predict_system.py | 22 ++++++++++++++++------ 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/paddleocr.py b/paddleocr.py index c76f09a4b2..77227e30b3 100644 --- a/paddleocr.py +++ b/paddleocr.py @@ -619,12 +619,12 @@ def __init__(self, **kwargs): def ocr(self, img, det=True, rec=True, cls=True): """ - ocr with paddleocr + OCR with PaddleOCR args: - img: img for ocr, support ndarray, img_path and list or ndarray - det: use text detection or not. If false, only rec will be exec. Default is True - rec: use text recognition or not. If false, only det will be exec. Default is True - cls: use angle classifier or not. Default is True. If true, the text with rotation of 180 degrees can be recognized. If no text is rotated by 180 degrees, use cls=False to get better performance. Text with rotation of 90 or 270 degrees can be recognized even if cls=False. + img: img for OCR, support ndarray, img_path and list or ndarray + det: use text detection or not. If False, only rec will be exec. Default is True + rec: use text recognition or not. If False, only det will be exec. Default is True + cls: use angle classifier or not. Default is True. If True, the text with rotation of 180 degrees can be recognized. If no text is rotated by 180 degrees, use cls=False to get better performance. Text with rotation of 90 or 270 degrees can be recognized even if cls=False. """ assert isinstance(img, (np.ndarray, list, str, bytes)) if isinstance(img, list) and det == True: @@ -632,7 +632,7 @@ def ocr(self, img, det=True, rec=True, cls=True): exit(0) if cls == True and self.use_angle_cls == False: logger.warning( - 'Since the angle classifier is not initialized, the angle classifier will not be uesd during the forward process' + 'Since the angle classifier is not initialized, it will not be used during the forward process' ) img = check_img(img) @@ -647,6 +647,9 @@ def ocr(self, img, det=True, rec=True, cls=True): ocr_res = [] for idx, img in enumerate(imgs): dt_boxes, rec_res, _ = self.__call__(img, cls) + if not dt_boxes and not rec_res: + ocr_res.append(None) + continue tmp_res = [[box.tolist(), res] for box, res in zip(dt_boxes, rec_res)] ocr_res.append(tmp_res) @@ -655,6 +658,9 @@ def ocr(self, img, det=True, rec=True, cls=True): ocr_res = [] for idx, img in enumerate(imgs): dt_boxes, elapse = self.text_detector(img) + if not dt_boxes: + ocr_res.append(None) + continue tmp_res = [box.tolist() for box in dt_boxes] ocr_res.append(tmp_res) return ocr_res diff --git a/tools/infer/predict_system.py b/tools/infer/predict_system.py index 3ddcfda619..8af45b4cf5 100755 --- a/tools/infer/predict_system.py +++ b/tools/infer/predict_system.py @@ -65,15 +65,25 @@ def draw_crop_rec_res(self, output_dir, img_crop_list, rec_res): self.crop_image_res_index += bbox_num def __call__(self, img, cls=True): - time_dict = {'det': 0, 'rec': 0, 'csl': 0, 'all': 0} + time_dict = {'det': 0, 'rec': 0, 'cls': 0, 'all': 0} + + if img is None: + logger.debug("no valid image provided") + return None, None, time_dict + start = time.time() ori_im = img.copy() dt_boxes, elapse = self.text_detector(img) time_dict['det'] = elapse - logger.debug("dt_boxes num : {}, elapse : {}".format( - len(dt_boxes), elapse)) + if dt_boxes is None: - return None, None + logger.debug("no dt_boxes found, elapsed : {}".format(elapse)) + end = time.time() + time_dict['all'] = end - start + return None, None, time_dict + else: + logger.debug("dt_boxes num : {}, elapsed : {}".format( + len(dt_boxes), elapse)) img_crop_list = [] dt_boxes = sorted_boxes(dt_boxes) @@ -89,12 +99,12 @@ def __call__(self, img, cls=True): img_crop_list, angle_list, elapse = self.text_classifier( img_crop_list) time_dict['cls'] = elapse - logger.debug("cls num : {}, elapse : {}".format( + logger.debug("cls num : {}, elapsed : {}".format( len(img_crop_list), elapse)) rec_res, elapse = self.text_recognizer(img_crop_list) time_dict['rec'] = elapse - logger.debug("rec_res num : {}, elapse : {}".format( + logger.debug("rec_res num : {}, elapsed : {}".format( len(rec_res), elapse)) if self.args.save_crop_res: self.draw_crop_rec_res(self.args.crop_res_save_dir, img_crop_list, From c57570aab39aee6ba8a1e935654ff92e1937de84 Mon Sep 17 00:00:00 2001 From: UserUnknownFactor <63057995+UserUnknownFactor@users.noreply.github.com> Date: Wed, 19 Jul 2023 13:08:05 +0300 Subject: [PATCH 2/2] Add preprocessing common to OCR tasks Add preprocessing to options --- paddleocr.py | 34 +++++++++++++++++++++++++++------- ppocr/utils/utility.py | 19 +++++++++++++++++++ ppstructure/utility.py | 17 ++++++++++++++++- tools/infer/utility.py | 4 +++- 4 files changed, 65 insertions(+), 9 deletions(-) diff --git a/paddleocr.py b/paddleocr.py index 77227e30b3..dc92cbf6b7 100644 --- a/paddleocr.py +++ b/paddleocr.py @@ -46,7 +46,7 @@ def _import_file(module_name, file_path, make_importable=False): ppstructure = importlib.import_module('ppstructure', 'paddleocr') from ppocr.utils.logging import get_logger from tools.infer import predict_system -from ppocr.utils.utility import check_and_read, get_image_file_list +from ppocr.utils.utility import check_and_read, get_image_file_list, alpha_to_color, binarize_img from ppocr.utils.network import maybe_download, download_with_progressbar, is_link, confirm_model_dir_url from tools.infer.utility import draw_ocr, str2bool, check_gpu from ppstructure.utility import init_args, draw_structure_result @@ -513,7 +513,7 @@ def get_model_config(type, version, model_type, lang): def img_decode(content: bytes): np_arr = np.frombuffer(content, dtype=np.uint8) - return cv2.imdecode(np_arr, cv2.IMREAD_COLOR) + return cv2.imdecode(np_arr, cv2.IMREAD_UNCHANGED) def check_img(img): @@ -617,7 +617,7 @@ def __init__(self, **kwargs): super().__init__(params) self.page_num = params.page_num - def ocr(self, img, det=True, rec=True, cls=True): + def ocr(self, img, det=True, rec=True, cls=True, bin=False, inv=False, alpha_color=(255, 255, 255)): """ OCR with PaddleOCR args: @@ -625,6 +625,9 @@ def ocr(self, img, det=True, rec=True, cls=True): det: use text detection or not. If False, only rec will be exec. Default is True rec: use text recognition or not. If False, only det will be exec. Default is True cls: use angle classifier or not. Default is True. If True, the text with rotation of 180 degrees can be recognized. If no text is rotated by 180 degrees, use cls=False to get better performance. Text with rotation of 90 or 270 degrees can be recognized even if cls=False. + bin: binarize image to black and white. Default is False. + inv: invert image colors. Default is False. + alpha_color: set RGB color Tuple for transparent parts replacement. Default is pure white. """ assert isinstance(img, (np.ndarray, list, str, bytes)) if isinstance(img, list) and det == True: @@ -643,9 +646,19 @@ def ocr(self, img, det=True, rec=True, cls=True): imgs = img[:self.page_num] else: imgs = [img] + + def preprocess_image(_image): + _image = alpha_to_color(_image, alpha_color) + if inv: + _image = cv2.bitwise_not(_image) + if bin: + _image = binarize_img(_image) + return _image + if det and rec: ocr_res = [] for idx, img in enumerate(imgs): + img = preprocess_image(img) dt_boxes, rec_res, _ = self.__call__(img, cls) if not dt_boxes and not rec_res: ocr_res.append(None) @@ -657,6 +670,7 @@ def ocr(self, img, det=True, rec=True, cls=True): elif det and not rec: ocr_res = [] for idx, img in enumerate(imgs): + img = preprocess_image(img) dt_boxes, elapse = self.text_detector(img) if not dt_boxes: ocr_res.append(None) @@ -669,6 +683,7 @@ def ocr(self, img, det=True, rec=True, cls=True): cls_res = [] for idx, img in enumerate(imgs): if not isinstance(img, list): + img = preprocess_image(img) img = [img] if self.use_angle_cls and cls: img, cls_res_tmp, elapse = self.text_classifier(img) @@ -770,10 +785,15 @@ def main(): img_name = os.path.basename(img_path).split('.')[0] logger.info('{}{}{}'.format('*' * 10, img_path, '*' * 10)) if args.type == 'ocr': - result = engine.ocr(img_path, - det=args.det, - rec=args.rec, - cls=args.use_angle_cls) + result = engine.ocr( + img_path, + det=args.det, + rec=args.rec, + cls=args.use_angle_cls, + bin=args.binarize, + inv=args.invert, + alpha_color=args.alphacolor + ) if result is not None: lines = [] for idx in range(len(result)): diff --git a/ppocr/utils/utility.py b/ppocr/utils/utility.py index 0f8660ceb4..f788e79cd5 100755 --- a/ppocr/utils/utility.py +++ b/ppocr/utils/utility.py @@ -75,6 +75,25 @@ def get_image_file_list(img_file): imgs_lists = sorted(imgs_lists) return imgs_lists +def binarize_img(img): + if len(img.shape) == 3 and img.shape[2] == 3: + gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) # conversion to grayscale image + # use cv2 threshold binarization + _, gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) + img = cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR) + return img + +def alpha_to_color(img, alpha_color=(255, 255, 255)): + if len(img.shape) == 3 and img.shape[2] == 4: + B, G, R, A = cv2.split(img) + alpha = A / 255 + + R = (alpha_color[0] * (1 - alpha) + R * alpha).astype(np.uint8) + G = (alpha_color[1] * (1 - alpha) + G * alpha).astype(np.uint8) + B = (alpha_color[2] * (1 - alpha) + B * alpha).astype(np.uint8) + + img = cv2.merge((B, G, R)) + return img def check_and_read(img_path): if os.path.basename(img_path)[-3:] in ['gif', 'GIF']: diff --git a/ppstructure/utility.py b/ppstructure/utility.py index 28ef3d9f47..4ab4b88b9b 100644 --- a/ppstructure/utility.py +++ b/ppstructure/utility.py @@ -16,7 +16,7 @@ import PIL from PIL import Image, ImageDraw, ImageFont import numpy as np -from tools.infer.utility import draw_ocr_box_txt, str2bool, init_args as infer_args +from tools.infer.utility import draw_ocr_box_txt, str2bool, str2int_tuple, init_args as infer_args import math @@ -100,6 +100,21 @@ def init_args(): type=str2bool, default=False, help='Whether to use pdf2docx api') + parser.add_argument( + "--invert", + type=str2bool, + default=False, + help='Whether to invert image before processing') + parser.add_argument( + "--binarize", + type=str2bool, + default=False, + help='Whether to threshold binarize image before processing') + parser.add_argument( + "--alphacolor", + type=str2int_tuple, + default=(255, 255, 255), + help='Replacement color for the alpha channel, if the latter is present; R,G,B integers') return parser diff --git a/tools/infer/utility.py b/tools/infer/utility.py index fcd8ba7f4d..b064cbf189 100644 --- a/tools/infer/utility.py +++ b/tools/infer/utility.py @@ -29,8 +29,10 @@ def str2bool(v): - return v.lower() in ("true", "t", "1") + return v.lower() in ("true", "yes", "t", "y", "1") +def str2int_tuple(v): + return tuple([int(i.strip()) for i in v.split(",")]) def init_args(): parser = argparse.ArgumentParser()