Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Convert PDFs to PNGs before scanning #113

Merged
merged 1 commit into from
Dec 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion build/configs/scanners.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -408,13 +408,16 @@ scanners:
- 'jpeg_file'
- 'image/png'
- 'png_file'
- 'image/tiff'
- 'image/tif'
- 'type_is_tiff'
- 'image/x-ms-bmp'
- 'image/bmp'
- 'bmp_file'
- 'image/webp'
- 'pdf_file'
priority: 5
options:
pdf_to_png: True
'ScanRar':
- positive:
flavors:
Expand Down
3 changes: 3 additions & 0 deletions configs/python/backend/backend.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -416,7 +416,10 @@ scanners:
- 'type_is_tiff'
- 'image/x-ms-bmp'
- 'bmp_file'
- 'pdf_file'
priority: 5
options:
pdf_to_png: True
'ScanRar':
- positive:
flavors:
Expand Down
5 changes: 3 additions & 2 deletions src/python/strelka/scanners/scan_ocr.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import fitz
import os
import subprocess
import tempfile

import fitz
from strelka import strelka


Expand All @@ -16,6 +16,7 @@ class ScanOcr(strelka.Scanner):
tmp_directory: Location where tempfile writes temporary files.
Defaults to '/tmp/'.
"""

def scan(self, data, file, options, expire_at):
extract_text = options.get('extract_text', False)
tmp_directory = options.get('tmp_directory', '/tmp/')
Expand All @@ -34,7 +35,7 @@ def scan(self, data, file, options, expire_at):
tess_return = subprocess.call(
['tesseract', tmp_data.name, tmp_tess.name],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL
stderr=subprocess.DEVNULL,
)
tess_txt_name = f'{tmp_tess.name}.txt'
if tess_return == 0:
Expand Down
4 changes: 3 additions & 1 deletion src/python/strelka/scanners/scan_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,10 @@ def scan(self, data, file, options, expire_at):
for link in links:
if "uri" in link:
self.event["annotated_uris"].append(link["uri"])
if extract_text:
if extract_text and hasattr(page, "getText"):
extracted_text += page.getText()
if extract_text and hasattr(page, "get_text"):
extracted_text += page.get_text()

# PDF Text Extraction
# Caution: Will increase time and object storage size
Expand Down
16 changes: 13 additions & 3 deletions src/python/strelka/scanners/scan_qr.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from pyzbar.pyzbar import decode
from PIL import Image
import io
import re

import fitz
from PIL import Image
from pyzbar.pyzbar import decode, ZBarSymbol
from strelka import strelka

# Regex to match URL
Expand All @@ -14,9 +15,18 @@ class ScanQr(strelka.Scanner):
"""
Collects QR code metadata from image files.
"""

def scan(self, data, file, options, expire_at):
pdf_to_png = options.get('pdf_to_png', False)

try:
barcodes = decode(Image.open(io.BytesIO(data)))
if pdf_to_png and 'application/pdf' in file.flavors.get('mime', []):
# TODO: Use fitz's built-in OCR support, which also wraps tesseract
doc = fitz.open(stream=data, filetype='pdf')
data = doc.get_page_pixmap(0, dpi=150).tobytes()

img = Image.open(io.BytesIO(data))
barcodes = decode(img, symbols=[ZBarSymbol.QRCODE])

try:
self.event['data'] = barcodes[0].data.decode('utf-8')
Expand Down
Loading