From 5d28d39cba35773a0e31dd9d7282cfc40c3cfa9f Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Thu, 13 Feb 2025 16:41:06 -0500 Subject: [PATCH 1/3] Adjust OCR thresh --- marker/builders/line.py | 12 ++++++++++-- marker/processors/table.py | 2 ++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/marker/builders/line.py b/marker/builders/line.py index 4391c7e8..91b3ef87 100644 --- a/marker/builders/line.py +++ b/marker/builders/line.py @@ -3,7 +3,7 @@ import numpy as np from ftfy import fix_text -from PIL import Image, ImageDraw +from PIL import Image from surya.detection import DetectionPredictor, InlineDetectionPredictor, TextDetectionResult from surya.ocr_error import OCRErrorPredictor @@ -55,6 +55,10 @@ class LineBuilder(BaseBuilder): "The minimum coverage ratio required for the layout model to consider", "the lines from the PdfProvider valid.", ] = .25 + min_document_ocr_threshold: Annotated[ + float, + "If less pages than this threshold are good, OCR will happen in the document. Otherwise it will not." + ] = 0.85 span_inline_math_overlap_threshold: Annotated[ float, "The minimum overlap of a span with an inline math box to consider for removal" @@ -163,12 +167,16 @@ def get_all_lines(self, document: Document, provider: PdfProvider, do_inline_mat for document_page, ocr_error_detection_label in zip(document.pages, ocr_error_detection_results.labels): provider_lines: List[ProviderOutput] = provider.page_lines.get(document_page.page_id, []) provider_lines_good = all([ - bool(provider), + bool(provider_lines), ocr_error_detection_label != 'bad', self.check_layout_coverage(document_page, provider_lines) ]) layout_good.append(provider_lines_good) + # Don't OCR if only a few pages are bad + if sum(layout_good) > len(document.pages) * self.min_document_ocr_threshold: + layout_good = [True] * len(document.pages) + run_detection = [not good or do_inline_math_detection for good in layout_good] page_images = [page.get_image(highres=False, remove_blocks=self.ocr_remove_blocks) for page, good in zip(document.pages, run_detection) if good] detection_results, inline_detection_results = self.get_detection_results(page_images, run_detection, do_inline_math_detection) diff --git a/marker/processors/table.py b/marker/processors/table.py index 8bd2831e..84a76b92 100644 --- a/marker/processors/table.py +++ b/marker/processors/table.py @@ -12,6 +12,8 @@ from pdftext.extraction import table_output from marker.processors import BaseProcessor +from marker.providers.pdf import PdfProvider +from marker.providers.registry import provider_from_filepath from marker.schema import BlockTypes from marker.schema.blocks.tablecell import TableCell from marker.schema.document import Document From 78a14bbaa1cfae5c7ca7fdc849ce2295a0899dfc Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Thu, 13 Feb 2025 16:53:55 -0500 Subject: [PATCH 2/3] OCR irregular docs --- marker/builders/line.py | 6 +++++- marker/processors/table.py | 2 -- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/marker/builders/line.py b/marker/builders/line.py index 91b3ef87..bc38e4bd 100644 --- a/marker/builders/line.py +++ b/marker/builders/line.py @@ -126,7 +126,6 @@ def get_ocr_error_batch_size(self): return 4 def get_detection_results(self, page_images: List[Image.Image], run_detection: List[bool], do_inline_math_detection: bool): - page_images = [p for p, good in zip(page_images, run_detection) if good] page_detection_results = self.detection_model( images=page_images, batch_size=self.get_detection_batch_size() @@ -152,10 +151,12 @@ def get_detection_results(self, page_images: List[Image.Image], run_detection: L inline_results.append(None) assert idx == len(page_images) + assert len(run_detection) == len(detection_results) == len(inline_results) return detection_results, inline_results def get_all_lines(self, document: Document, provider: PdfProvider, do_inline_math_detection: bool): + assert len(document.pages) == len(provider.page_lines) ocr_error_detection_results = self.ocr_error_detection(document.pages, provider.page_lines) boxes_to_ocr = {page.page_id: [] for page in document.pages} @@ -179,6 +180,9 @@ def get_all_lines(self, document: Document, provider: PdfProvider, do_inline_mat run_detection = [not good or do_inline_math_detection for good in layout_good] page_images = [page.get_image(highres=False, remove_blocks=self.ocr_remove_blocks) for page, good in zip(document.pages, run_detection) if good] + + # Note: run_detection is longer than page_images, since it has a value for each page, not just good ones + # Detection results and inline detection results are for every page (we use run_detection to make the list full length) detection_results, inline_detection_results = self.get_detection_results(page_images, run_detection, do_inline_math_detection) assert len(detection_results) == len(inline_detection_results) == len(layout_good) == len(document.pages) diff --git a/marker/processors/table.py b/marker/processors/table.py index 84a76b92..8bd2831e 100644 --- a/marker/processors/table.py +++ b/marker/processors/table.py @@ -12,8 +12,6 @@ from pdftext.extraction import table_output from marker.processors import BaseProcessor -from marker.providers.pdf import PdfProvider -from marker.providers.registry import provider_from_filepath from marker.schema import BlockTypes from marker.schema.blocks.tablecell import TableCell from marker.schema.document import Document From cdd482b79e4db8ead5cd8ac2da4bd64d385282e8 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Thu, 13 Feb 2025 20:03:37 -0500 Subject: [PATCH 3/3] Bump version --- marker/builders/line.py | 1 - pyproject.toml | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/marker/builders/line.py b/marker/builders/line.py index bc38e4bd..dd39ad39 100644 --- a/marker/builders/line.py +++ b/marker/builders/line.py @@ -156,7 +156,6 @@ def get_detection_results(self, page_images: List[Image.Image], run_detection: L def get_all_lines(self, document: Document, provider: PdfProvider, do_inline_math_detection: bool): - assert len(document.pages) == len(provider.page_lines) ocr_error_detection_results = self.ocr_error_detection(document.pages, provider.page_lines) boxes_to_ocr = {page.page_id: [] for page in document.pages} diff --git a/pyproject.toml b/pyproject.toml index b3a17f5c..9ab9583c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "marker-pdf" -version = "1.5.0" +version = "1.5.1" description = "Convert PDF to markdown with high speed and accuracy." authors = ["Vik Paruchuri "] readme = "README.md"