Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

✨ add support for multi-receipt extraction #240

Merged
merged 13 commits into from
Jun 12, 2024
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,4 @@ repos:
- types-requests
- types-setuptools
- importlib-metadata
- types-Pillow
20 changes: 20 additions & 0 deletions examples/multi_receipts_tutorial.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from mindee import Client, PredictResponse, product
from mindee.image_extraction.multi_receipts_extractor.mult_receipts_extractor import (
extract_receipts,
)

# Init a new client
mindee_client = Client()

# Load a file from disk
input_doc = mindee_client.source_from_path("path/to/your/file.ext")

# First pass: detect where each receipt is located in the document.
# close_file=False keeps the input open, since extract_receipts() reads it again below.
result_split: PredictResponse = mindee_client.parse(
    product.MultiReceiptsDetectorV1, input_doc, close_file=False
)

# Crop every detected receipt out of the original document
extracted_receipts = extract_receipts(input_doc, result_split.document.inference)
for receipt in extracted_receipts:
    # Optionally: save each extracted receipt
    # (internal_file_name already carries its own extension)
    # receipt.save_to_file(f"./{receipt.internal_file_name}")

    # Second pass: send each cropped receipt to the receipt parsing API
    receipt_as_source = receipt.as_source()
    result_receipt = mindee_client.parse(product.ReceiptV5, receipt_as_source)
    print(result_receipt.document)
Empty file.
6 changes: 6 additions & 0 deletions mindee/image_extraction/common/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from mindee.image_extraction.common.extracted_image import ExtractedImage
from mindee.image_extraction.common.image_extractor import (
attach_image_as_new_file,
extract_multiple_images_from_image,
extract_multiple_images_from_page,
)
58 changes: 58 additions & 0 deletions mindee/image_extraction/common/extracted_image.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import io
from pathlib import Path
from typing import Optional

from PIL import Image

from mindee.error import MindeeError
from mindee.input import FileInput
from mindee.logger import logger


class ExtractedImage:
    """Generic class for image extraction."""

    def __init__(self, buffer: bytes, file_name: str):
        """
        Initialize the ExtractedImage with a buffer and an internal file name.

        :param buffer: The byte buffer representing the image.
        :param file_name: The internal file name of the image.
        """
        self.buffer = io.BytesIO(buffer)
        self.internal_file_name = file_name
        # Expose the name on the stream itself, like a regular file object.
        self.buffer.name = self.internal_file_name

    def save_to_file(self, output_path: str, file_format: Optional[str] = None):
        """
        Saves the document to a file.

        :param output_path: Path to save the file to.
        :param file_format: Optional PIL-compatible format for the file. Inferred from file extension if not provided.
        :raises MindeeError: If an invalid path or filename is provided, or if the file could not be written.
        """
        try:
            resolved_path = Path(output_path).resolve()
            if not file_format and not resolved_path.suffix:
                # Nothing to infer the output format from.
                raise ValueError("Invalid file format.")
            self.buffer.seek(0)
            image = Image.open(self.buffer)
            # When no format is given, PIL resolves it from the file extension.
            # The raw suffix (e.g. ".JPG") must not be passed as the format: it
            # is not a valid PIL format name.
            image.save(resolved_path, format=file_format or None)
            logger.info("File saved successfully to '%s'.", resolved_path)
        except TypeError as exc:
            raise MindeeError("Invalid path/filename provided.") from exc
        except Exception as exc:
            raise MindeeError(f"Could not save file {Path(output_path).name}.") from exc

    def as_source(self) -> FileInput:
        """
        Return the file as a Mindee-compatible FileInput source.

        :returns: A FileInput source wrapping the internal buffer.
        """
        # Rewind first: save_to_file() may have moved the read position.
        self.buffer.seek(0)
        return FileInput(self.buffer)
87 changes: 87 additions & 0 deletions mindee/image_extraction/common/image_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import io
from pathlib import Path
from typing import BinaryIO, List, Union

import pypdfium2 as pdfium
from PIL import Image

from mindee.geometry import Point, get_min_max_x, get_min_max_y


def attach_image_as_new_file(  # type: ignore
    input_buffer: BinaryIO,
) -> pdfium.PdfDocument:
    """
    Attaches an image as a new page in a new PdfDocument object.

    The image is re-encoded as an RGB JPEG before being embedded, and the page
    is sized to match the embedded image.

    :param input_buffer: Readable and seekable binary stream holding an image PIL can open.
    :return: A PdfDocument handle.
    """
    input_buffer.seek(0)
    image_buffer = io.BytesIO()
    with Image.open(input_buffer) as image:
        # convert() returns a new image rather than modifying in place, so its
        # result is the one that must be saved; JPEG cannot store modes such
        # as RGBA or P.
        image.convert("RGB").save(image_buffer, format="JPEG")

    pdf = pdfium.PdfDocument.new()

    image_pdf = pdfium.PdfImage.new(pdf)
    image_pdf.load_jpeg(image_buffer)
    width, height = image_pdf.get_size()

    # Scale the image object to its own size so it fills the whole page.
    matrix = pdfium.PdfMatrix().scale(width, height)
    image_pdf.set_matrix(matrix)

    page = pdf.new_page(width, height)
    page.insert_obj(image_pdf)
    page.gen_content()
    return pdf


def extract_multiple_images_from_image(
    image: Union[bytes, str, Path], polygons: List[List[Point]]
) -> List[Image.Image]:
    """
    Crops several elements out of an image file.

    :param image: Location of the image, handed to ``PIL.Image.open`` as-is.
    :param polygons: One polygon per element to extract.
    :return: One PIL image per polygon, in the same order as ``polygons``.
    """
    opened_image = Image.open(image)
    return extract_multiple_images_from_page(opened_image, polygons)


def extract_multiple_images_from_page(  # type: ignore
    page: Union[pdfium.PdfPage, Image.Image], polygons: List[List[Point]]
) -> List[Image.Image]:
    """
    Crops the bounding box of each polygon out of a page.

    Polygon coordinates are multiplied by the page width and height, so they
    are expected to be relative to the page size (presumably in the 0-1 range).

    :param page: Either a PIL image, or a pdfium.PdfPage which gets rasterized first.
    :param polygons: One polygon per element to extract.
    :return: One cropped PIL image per polygon, in the same order as ``polygons``.
    """
    if not isinstance(page, pdfium.PdfPage):
        raster = page
        page_width, page_height = page.size
    else:
        raster = page.render().to_pil()
        page_width, page_height = page.get_size()

    crops = []
    for polygon in polygons:
        x_range = get_min_max_x(polygon)
        y_range = get_min_max_y(polygon)
        # (left, top, right, bottom), truncated to whole pixels.
        crop_box = (
            int(x_range.min * page_width),
            int(y_range.min * page_height),
            int(x_range.max * page_width),
            int(y_range.max * page_height),
        )
        crops.append(raster.crop(crop_box))
    return crops
3 changes: 3 additions & 0 deletions mindee/image_extraction/multi_receipts_extractor/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from mindee.image_extraction.multi_receipts_extractor.extracted_multi_receipt_image import (
ExtractedMultiReceiptsImage,
)
sebastianMindee marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from mindee.image_extraction.common import ExtractedImage


class ExtractedMultiReceiptsImage(ExtractedImage):
    """Single receipt image extracted from a multiple-receipts document."""

    _receipt_id: int
    _page_id: int

    def __init__(self, buffer, receipt_id: int, page_id: int):
        """
        Store the receipt image along with where it was found.

        :param buffer: Raw bytes of the extracted receipt image.
        :param receipt_id: ID of the receipt on its page.
        :param page_id: ID of the page the receipt was found on.
        """
        # NOTE(review): the internal name ends in ".pdf", while extract_receipts()
        # fills the buffer with JPEG data - confirm the extension is intentional.
        internal_name = f"receipt_p{page_id}_{receipt_id}.pdf"
        super().__init__(buffer, internal_name)
        self._page_id = page_id
        self._receipt_id = receipt_id

    @property
    def receipt_id(self):
        """
        ID of the receipt on a given page.

        :return: The receipt ID given at construction.
        """
        return self._receipt_id

    @property
    def page_id(self):
        """
        ID of the page the receipt was found on.

        :return: The page ID given at construction.
        """
        return self._page_id
sebastianMindee marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import io
from typing import List

import pypdfium2 as pdfium

from mindee.error import MindeeError
from mindee.image_extraction.common.image_extractor import (
attach_image_as_new_file,
extract_multiple_images_from_page,
)
from mindee.image_extraction.multi_receipts_extractor.extracted_multi_receipt_image import (
ExtractedMultiReceiptsImage,
)
from mindee.input import LocalInputSource
from mindee.parsing.common import Inference


def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument:  # type: ignore
    """
    Opens a local input source as a PDF document.

    PDF inputs are opened directly; any other input is treated as an image and
    wrapped into a new single-page PDF.

    :param input_file: Local input.
    :return: A valid PdfDocument handle.
    """
    source_stream = input_file.file_object
    source_stream.seek(0)
    if not input_file.is_pdf():
        return attach_image_as_new_file(source_stream)

    return pdfium.PdfDocument(source_stream)


def extract_receipts(
    input_file: LocalInputSource, inference: Inference
) -> List[ExtractedMultiReceiptsImage]:
    """
    Crops every detected receipt out of a multi-receipts document.

    Each page of the input is matched by index with the page of the same
    position in the inference, and each receipt bounding box found on that
    page is cropped and encoded as JPEG.

    :param input_file: File to extract sub-receipts from.
    :param inference: Results of the inference.
    :return: Individual extracted receipts as a list of ExtractedMultiReceiptsImage.
    :raises MindeeError: If the inference holds no receipt at all.
    """
    if not inference.prediction.receipts:
        raise MindeeError(
            "No possible receipts candidates found for MultiReceipts extraction."
        )

    extracted: List[ExtractedMultiReceiptsImage] = []
    document = load_pdf_doc(input_file)
    for page_index, pdf_page in enumerate(document):
        page_receipts = inference.pages[page_index].prediction.receipts
        bounding_boxes = [found.bounding_box for found in page_receipts]
        crops = extract_multiple_images_from_page(pdf_page, bounding_boxes)
        for crop_index, crop in enumerate(crops):
            jpeg_stream = io.BytesIO()
            crop.save(jpeg_stream, format="JPEG")
            extracted.append(
                ExtractedMultiReceiptsImage(
                    jpeg_stream.getvalue(), crop_index, page_index
                )
            )
    return extracted
2 changes: 1 addition & 1 deletion mindee/input/local_response.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def is_valid_hmac_signature(
Checks if the hmac signature of the local response is valid.

:param secret_key: Secret key, given as a string.
:param signature:
:param signature: HMAC signature, given as a string.
:return: True if the HMAC signature is valid.
"""
return signature == self.get_hmac_signature(secret_key)
6 changes: 5 additions & 1 deletion mindee/parsing/standard/locale.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,11 @@ def __init__(
:param reconstructed: Bool for reconstructed object (not extracted in the API)
:param page_id: Page number for multi-page document
"""
value_key = "value" if "value" in raw_prediction else "language"
value_key = (
"value"
if ("value" in raw_prediction and raw_prediction["value"])
else "language"
)

super().__init__(
raw_prediction,
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ safe_licenses = [
"MIT License",
"Mozilla Public License 2.0 (MPL 2.0)",
"BSD License",
"(Apache-2.0 OR BSD-3-Clause) AND LicenseRef-PdfiumThirdParty"
"(Apache-2.0 OR BSD-3-Clause) AND LicenseRef-PdfiumThirdParty",
"Historical Permission Notice and Disclaimer (HPND)"
]

[tool.pytest.ini_options]
Expand Down
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ include_package_data = True
python_requires = >=3.7
install_requires =
pypdfium2>=4.0,<5
Pillow>=9.5.0
pytz>=2023.3
requests~=2.31

Expand Down
Empty file.
35 changes: 35 additions & 0 deletions tests/image_extraction/test_image_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import json

import pytest
from PIL import Image

from mindee.image_extraction.common import extract_multiple_images_from_image
from mindee.input import PathInput
from mindee.product import BarcodeReaderV1
from tests.test_inputs import PRODUCT_DATA_DIR


@pytest.fixture
def barcode_path():
    """Path of the sample barcode image the elements are extracted from."""
    barcode_dir = PRODUCT_DATA_DIR / "barcode_reader"
    return barcode_dir / "default_sample.jpg"


@pytest.fixture
def barcode_json_path():
    """Path of the stored API response matching the sample barcode image."""
    response_dir = PRODUCT_DATA_DIR / "barcode_reader" / "response_v1"
    return response_dir / "complete.json"


def test_barcode_image_extraction(barcode_path, barcode_json_path):
    """Barcodes are cropped out of the sample image with the expected sizes."""
    with open(barcode_json_path, "rb") as json_file:
        raw_response = json.load(json_file)
    prediction = BarcodeReaderV1(raw_response["document"]["inference"]).prediction

    polygons_1d = [barcode.polygon for barcode in prediction.codes_1d]
    polygons_2d = [barcode.polygon for barcode in prediction.codes_2d]
    crops_1d = extract_multiple_images_from_image(barcode_path, polygons_1d)
    crops_2d = extract_multiple_images_from_image(barcode_path, polygons_2d)

    # Comparing the full list of sizes checks both the count and each size.
    assert [crop.size for crop in crops_1d] == [(353, 200)]
    assert [crop.size for crop in crops_2d] == [(214, 216), (193, 201)]
Loading
Loading