Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

✨ add support for multi-receipt extraction #240

Merged
merged 13 commits into from
Jun 12, 2024
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,4 @@ repos:
- types-requests
- types-setuptools
- importlib-metadata
- types-Pillow
20 changes: 20 additions & 0 deletions examples/multi_receipts_tutorial.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from mindee import Client, PredictResponse, product
from mindee.image_extraction.multi_receipts_extractor.mult_receipts_extractor import (
extract_receipts,
)

# Init a new client
mindee_client = Client()

# Load a file from disk
input_doc = mindee_client.source_from_path("path/to/your/file.ext")

# Detect where each receipt is located. The file is kept open
# (close_file=False) because the extraction step below reads it again.
result_split: PredictResponse = mindee_client.parse(
    product.MultiReceiptsDetectorV1, input_doc, close_file=False
)

# Cut every detected receipt out of the original document
extracted_receipts = extract_receipts(input_doc, result_split.document.inference)
for receipt in extracted_receipts:
    # Optionally: save each extracted receipt (the internal file name
    # already carries its extension):
    # receipt.save_to_file(f"./{receipt.internal_file_name}")

    # Send each individual receipt to the receipt API
    receipt_as_source = receipt.as_source()
    result_receipt = mindee_client.parse(product.ReceiptV5, receipt_as_source)
    print(result_receipt.document)
Empty file.
5 changes: 5 additions & 0 deletions mindee/image_extraction/common/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from mindee.image_extraction.common.extracted_image import ExtractedImage
from mindee.image_extraction.common.image_extractor import (
attach_image_as_new_file,
extract_from_page,
)
48 changes: 48 additions & 0 deletions mindee/image_extraction/common/extracted_image.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import io
from pathlib import Path

from mindee.error import MindeeError
from mindee.input import FileInput
from mindee.logger import logger


class ExtractedImage:
    """Generic class for image extraction."""

    def __init__(self, buffer: bytes, file_name: str):
        """
        Initialize the ExtractedImage with a buffer and an internal file name.

        :param buffer: The byte buffer representing the image.
        :param file_name: The internal file name of the image.
        """
        self.buffer = io.BytesIO(buffer)
        self.internal_file_name = file_name
        # NOTE(review): the name is attached to the in-memory stream,
        # presumably so that FileInput can derive a file name from it - confirm.
        self.buffer.name = self.internal_file_name

    def save_to_file(self, output_path: str) -> None:
        """
        Saves the document to a file.

        :param output_path: Path (including the file name) to save the file to.
        :raises MindeeError: If an invalid path is provided, or if the file
            could not be written.
        """
        try:
            resolved_path = Path(output_path).resolve()
            with open(resolved_path, "wb") as file:
                # getvalue() returns the whole content without moving the
                # stream position, so saving never disturbs later reads.
                file.write(self.buffer.getvalue())
            logger.info("File saved successfully to '%s'.", resolved_path)
        except TypeError as exc:
            raise MindeeError("Invalid path/filename provided.") from exc
        except Exception as exc:
            raise MindeeError(f"Could not save file {Path(output_path).name}.") from exc

    def as_source(self) -> FileInput:
        """
        Return the file as a Mindee-compatible FileInput source.

        :returns: A FileInput source wrapping the internal buffer.
        """
        # Rewind first so the consumer always starts at the beginning,
        # even if the buffer has been read before.
        self.buffer.seek(0)
        return FileInput(self.buffer)
72 changes: 72 additions & 0 deletions mindee/image_extraction/common/image_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import io
from typing import BinaryIO, List

import pypdfium2 as pdfium
from PIL import Image

from mindee.geometry import Polygon, get_min_max_x, get_min_max_y


def attach_image_as_new_file(  # type: ignore
    input_buffer: BinaryIO,
) -> pdfium.PdfDocument:
    """
    Attaches an image as a new page in a PdfDocument object.

    The image is converted to RGB and re-encoded as JPEG before being embedded,
    and the created page has the same size as the image.

    :param input_buffer: Input buffer holding an image that Pillow can open.
    :return: A PdfDocument handle.
    """
    input_buffer.seek(0)
    image_buffer = io.BytesIO()
    source_image = Image.open(input_buffer)
    try:
        # convert() returns a new image instead of modifying in place, so the
        # converted copy is the one that must be saved; JPEG cannot store
        # modes such as RGBA or P.
        rgb_image = source_image.convert("RGB")
        rgb_image.save(image_buffer, format="JPEG")
    finally:
        source_image.close()

    pdf = pdfium.PdfDocument.new()

    image_pdf = pdfium.PdfImage.new(pdf)
    image_pdf.load_jpeg(image_buffer)
    width, height = image_pdf.get_size()

    # Scale the image object so it covers the whole page.
    matrix = pdfium.PdfMatrix().scale(width, height)
    image_pdf.set_matrix(matrix)

    # Create a new page in the PdfDocument and place the image on it.
    page = pdf.new_page(width, height)
    page.insert_obj(image_pdf)
    page.gen_content()
    return pdf


def extract_from_page(pdf_page: pdfium.PdfPage, polygons: List[Polygon]) -> List[bytes]:  # type: ignore
    """
    Extracts elements from a page based on a list of bounding boxes.

    Each polygon is reduced to its axis-aligned bounding box, scaled by the page
    size, and cropped out of a rendering of the page.

    :param pdf_page: Single PDF Page.
    :param polygons: List of coordinates to pull the elements from. They are
        multiplied by the page size, so they are presumably relative (0 to 1)
        coordinates.
    :return: List of byte strings, one per polygon, each holding the cropped
        element saved in PDF format.
    """
    if not polygons:
        return []

    width, height = pdf_page.get_size()

    # Note: cropping done via PIL instead of PyPDFium to simplify operations greatly.
    # The page is rendered once and reused for every polygon, since crop()
    # returns a new image and leaves the rendering untouched.
    page_image = pdf_page.render().to_pil()

    extracted_elements = []
    for polygon in polygons:
        min_max_x = get_min_max_x(polygon)
        min_max_y = get_min_max_y(polygon)

        left = min_max_x.min * width
        right = min_max_x.max * width
        top = min_max_y.min * height
        bottom = min_max_y.max * height

        cropped_content_pil = page_image.crop(
            (int(left), int(top), int(right), int(bottom))
        )
        pdf_buffer = io.BytesIO()
        cropped_content_pil.save(pdf_buffer, format="PDF")
        extracted_elements.append(pdf_buffer.getvalue())

    return extracted_elements
6 changes: 6 additions & 0 deletions mindee/image_extraction/multi_receipts_extractor/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from mindee.image_extraction.multi_receipts_extractor.extracted_multi_receipt_image import (
ExtractedMultiReceiptsImage,
)
from mindee.image_extraction.multi_receipts_extractor.mult_receipts_extractor import (
extract_receipts_from_page,
)
sebastianMindee marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from mindee.image_extraction.common import ExtractedImage


class ExtractedMultiReceiptsImage(ExtractedImage):
    """Extracted receipt image that keeps track of its page and of its index on that page."""

    _receipt_id: int
    _page_id: int

    def __init__(self, buffer, receipt_id: int, page_id: int):
        """
        Build the extracted receipt and name it after its page and receipt IDs.

        :param buffer: Raw content of the extracted receipt.
        :param receipt_id: ID of the receipt on its page.
        :param page_id: ID of the page the receipt was found on.
        """
        internal_name = f"receipt_p{page_id}_{receipt_id}.pdf"
        super().__init__(buffer, internal_name)
        self._receipt_id = receipt_id
        self._page_id = page_id

    @property
    def receipt_id(self):
        """
        ID of the receipt on a given page.

        :return: The receipt ID given at construction.
        """
        return self._receipt_id

    @property
    def page_id(self):
        """
        ID of the page the receipt was found on.

        :return: The page ID given at construction.
        """
        return self._page_id
sebastianMindee marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
from typing import List, Union

import pypdfium2 as pdfium

from mindee.error import MimeTypeError, MindeeError
from mindee.geometry.point import Point
from mindee.geometry.polygon import Polygon
from mindee.geometry.quadrilateral import Quadrilateral
from mindee.image_extraction.common.image_extractor import (
attach_image_as_new_file,
extract_from_page,
)
from mindee.image_extraction.multi_receipts_extractor.extracted_multi_receipt_image import (
ExtractedMultiReceiptsImage,
)
from mindee.input import LocalInputSource
from mindee.parsing.common import Inference


def extract_receipts_from_page(  # type: ignore
    pdf_page: pdfium.PdfPage,
    bounding_boxes: List[Union[List[Point], Polygon, Quadrilateral]],
    page_id: int,
) -> List[ExtractedMultiReceiptsImage]:
    """
    Extracts every receipt delimited by the given coordinates from a single page.

    Each extracted receipt is wrapped in an ExtractedMultiReceiptsImage whose
    receipt ID is its position in ``bounding_boxes``.

    :param pdf_page: PDF Page to extract from.
    :param bounding_boxes: A set of coordinates delimiting the position of each receipt.
    :param page_id: ID of the page the receipt is extracted from. Caution: this starts at 0, unlike the numbering in PDF
        pages.
    :return: A list of ExtractedMultiReceiptsImage.
    """
    raw_receipts = extract_from_page(pdf_page, bounding_boxes)  # type: ignore
    return [
        ExtractedMultiReceiptsImage(raw_receipt, receipt_id, page_id)
        for receipt_id, raw_receipt in enumerate(raw_receipts)
    ]


def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument:  # type: ignore
    """
    Opens a local input source as a PDF document.

    PDF inputs are opened directly; any other accepted type is wrapped into a
    new PDF document through attach_image_as_new_file.

    :param input_file: Local input.
    :return: A valid PdfDocument handle.
    :raises MimeTypeError: If the MIME type of the input is not in the accepted list.
    """
    accepted_mimetypes = [
        "application/pdf",
        "image/heic",
        "image/png",
        "image/jpg",
        "image/jpeg",
        "image/tiff",
        "image/webp",
    ]
    if input_file.file_mimetype not in accepted_mimetypes:
        raise MimeTypeError(f"Unsupported file type '{input_file.file_mimetype}'.")

    # Always read from the start of the underlying file.
    input_file.file_object.seek(0)
    if not input_file.is_pdf():
        return attach_image_as_new_file(input_file.file_object)

    return pdfium.PdfDocument(input_file.file_object)


def extract_receipts(
    input_file: LocalInputSource, inference: Inference
) -> List[ExtractedMultiReceiptsImage]:
    """
    Splits a multi-receipts document into its individual receipts.

    :param input_file: File to extract sub-receipts from.
    :param inference: Results of the inference.
    :return: Individual extracted receipts as an array of ExtractedMultiReceiptsImage.
    :raises MindeeError: If the inference holds no receipt at all.
    """
    if not inference.prediction.receipts:
        raise MindeeError(
            "No possible receipts candidates found for MultiReceipts extraction."
        )

    images: List[ExtractedMultiReceiptsImage] = []
    document = load_pdf_doc(input_file)
    for page_id, page in enumerate(document):
        # Page-level predictions are looked up by the same index as the page.
        page_prediction = inference.pages[page_id].prediction
        receipt_positions = [
            page_receipt.bounding_box for page_receipt in page_prediction.receipts
        ]
        images += extract_receipts_from_page(page, receipt_positions, page_id)
    return images
2 changes: 1 addition & 1 deletion mindee/input/local_response.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def is_valid_hmac_signature(
Checks if the hmac signature of the local response is valid.

:param secret_key: Secret key, given as a string.
:param signature:
:param signature: HMAC signature, given as a string.
:return: True if the HMAC signature is valid.
"""
return signature == self.get_hmac_signature(secret_key)
6 changes: 5 additions & 1 deletion mindee/parsing/standard/locale.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,11 @@ def __init__(
:param reconstructed: Bool for reconstructed object (not extracted in the API)
:param page_id: Page number for multi-page document
"""
value_key = "value" if "value" in raw_prediction else "language"
value_key = (
"value"
if ("value" in raw_prediction and raw_prediction["value"])
else "language"
)

super().__init__(
raw_prediction,
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ safe_licenses = [
"MIT License",
"Mozilla Public License 2.0 (MPL 2.0)",
"BSD License",
"(Apache-2.0 OR BSD-3-Clause) AND LicenseRef-PdfiumThirdParty"
"(Apache-2.0 OR BSD-3-Clause) AND LicenseRef-PdfiumThirdParty",
"Historical Permission Notice and Disclaimer (HPND)"
]

[tool.pytest.ini_options]
Expand Down
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ include_package_data = True
python_requires = >=3.7
install_requires =
pypdfium2>=4.0,<5
Pillow>=9.5.0
pytz>=2023.3
requests~=2.31

Expand Down
Empty file.
42 changes: 42 additions & 0 deletions tests/image_extraction/test_image_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from io import BytesIO

import pypdfium2 as pdfium
import pytest
from PIL import Image

from mindee.error import MimeTypeError
from tests.test_inputs import PRODUCT_DATA_DIR


@pytest.fixture
def single_page_path():
    """Path to the single-page JPEG sample of the multi-receipts detector."""
    sample_dir = PRODUCT_DATA_DIR / "multi_receipts_detector"
    return sample_dir / "default_sample.jpg"


@pytest.fixture
def multiple_pages_path():
    """Path to the multi-page PDF sample of the multi-receipts detector."""
    sample_dir = PRODUCT_DATA_DIR / "multi_receipts_detector"
    return sample_dir / "multipage_sample.pdf"


def test_get_images_mono_page(single_page_path):
    """Check the pixel dimensions of the single-page JPEG sample."""
    # PIL reports ``size`` as (width, height). The context manager releases
    # the image once its dimensions have been read.
    with Image.open(single_page_path) as jpg_file:
        jpg_width, jpg_height = jpg_file.size
    assert jpg_width == 3628
    assert jpg_height == 1552


def test_get_images_multiple_pages(multiple_pages_path):
    """Check the rendered dimensions of the first two pages of the PDF sample."""
    with open(multiple_pages_path, "rb") as f:
        pdf = pdfium.PdfDocument(f)
        # Render while the underlying file is still open.
        pdf_images = [page.render().to_pil() for page in pdf]

    # PIL reports ``size`` as (width, height).
    width_page_0, height_page_0 = pdf_images[0].size
    assert width_page_0 == 595
    assert height_page_0 == 842

    width_page_1, height_page_1 = pdf_images[1].size
    assert width_page_1 == 595
    assert height_page_1 == 842
Loading
Loading