From fd7e9603c875896804f0f5e595a0e044e8988342 Mon Sep 17 00:00:00 2001 From: Ross Wolf <31489089+rw-access@users.noreply.github.com> Date: Thu, 19 Dec 2024 10:11:11 -0700 Subject: [PATCH] Convert PDFs to PNGs before scanning --- build/configs/scanners.yaml | 3 +++ configs/python/backend/backend.yaml | 3 +++ src/python/strelka/scanners/scan_ocr.py | 5 +++-- src/python/strelka/scanners/scan_pdf.py | 4 +++- src/python/strelka/scanners/scan_qr.py | 16 +++++++++++++--- 5 files changed, 25 insertions(+), 6 deletions(-) diff --git a/build/configs/scanners.yaml b/build/configs/scanners.yaml index 4cf273e6..77bfc5f5 100644 --- a/build/configs/scanners.yaml +++ b/build/configs/scanners.yaml @@ -408,13 +408,16 @@ scanners: - 'jpeg_file' - 'image/png' - 'png_file' - 'image/tiff' - 'type_is_tiff' - 'image/x-ms-bmp' - 'image/bmp' - 'bmp_file' - 'image/webp' + - 'pdf_file' priority: 5 + options: + pdf_to_png: True 'ScanRar': - positive: flavors: diff --git a/configs/python/backend/backend.yaml b/configs/python/backend/backend.yaml index c05601bd..017c4083 100644 --- a/configs/python/backend/backend.yaml +++ b/configs/python/backend/backend.yaml @@ -416,7 +416,10 @@ scanners: - 'type_is_tiff' - 'image/x-ms-bmp' - 'bmp_file' + - 'pdf_file' priority: 5 + options: + pdf_to_png: True 'ScanRar': - positive: flavors: diff --git a/src/python/strelka/scanners/scan_ocr.py b/src/python/strelka/scanners/scan_ocr.py index a599a032..cfdc1188 100644 --- a/src/python/strelka/scanners/scan_ocr.py +++ b/src/python/strelka/scanners/scan_ocr.py @@ -1,8 +1,8 @@ -import fitz import os import subprocess import tempfile +import fitz from strelka import strelka @@ -16,6 +16,7 @@ class ScanOcr(strelka.Scanner): tmp_directory: Location where tempfile writes temporary files. Defaults to '/tmp/'. 
""" + def scan(self, data, file, options, expire_at): extract_text = options.get('extract_text', False) tmp_directory = options.get('tmp_directory', '/tmp/') @@ -34,7 +35,7 @@ def scan(self, data, file, options, expire_at): tess_return = subprocess.call( ['tesseract', tmp_data.name, tmp_tess.name], stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL + stderr=subprocess.DEVNULL, ) tess_txt_name = f'{tmp_tess.name}.txt' if tess_return == 0: diff --git a/src/python/strelka/scanners/scan_pdf.py b/src/python/strelka/scanners/scan_pdf.py index 86e6780e..47afa1a8 100644 --- a/src/python/strelka/scanners/scan_pdf.py +++ b/src/python/strelka/scanners/scan_pdf.py @@ -101,8 +101,10 @@ def scan(self, data, file, options, expire_at): for link in links: if "uri" in link: self.event["annotated_uris"].append(link["uri"]) - if extract_text: + if extract_text and hasattr(page, "getText"): extracted_text += page.getText() + if extract_text and hasattr(page, "get_text"): + extracted_text += page.get_text() # PDF Text Extraction # Caution: Will increase time and object storage size diff --git a/src/python/strelka/scanners/scan_qr.py b/src/python/strelka/scanners/scan_qr.py index 220b60b2..f40f2cb3 100644 --- a/src/python/strelka/scanners/scan_qr.py +++ b/src/python/strelka/scanners/scan_qr.py @@ -1,8 +1,9 @@ -from pyzbar.pyzbar import decode -from PIL import Image import io import re +import fitz +from PIL import Image +from pyzbar.pyzbar import decode, ZBarSymbol from strelka import strelka # Regex to match URL @@ -14,9 +15,18 @@ class ScanQr(strelka.Scanner): """ Collects QR code metadata from image files. 
""" + def scan(self, data, file, options, expire_at): + pdf_to_png = options.get('pdf_to_png', False) + try: - barcodes = decode(Image.open(io.BytesIO(data))) + if pdf_to_png and 'application/pdf' in file.flavors.get('mime', []): + # TODO: Use fitz builtin OCR support which also wraps tesseract + doc = fitz.open(stream=data, filetype='pdf') + data = doc.get_page_pixmap(0, dpi=150).tobytes() + + img = Image.open(io.BytesIO(data)) + barcodes = decode(img, symbols=[ZBarSymbol.QRCODE]) try: self.event['data'] = barcodes[0].data.decode('utf-8')