Merge pull request #545 from VikParuchuri/dev

Adjust OCR thresh
VikParuchuri · Feb 14, 2025 · 9721c17 · 9721c17
2 parents b87d088 + cdd482b
commit 9721c17
Show file tree

Hide file tree

Showing 2 changed files with 15 additions and 4 deletions.
diff --git a/marker/builders/line.py b/marker/builders/line.py
@@ -3,7 +3,7 @@
 
 import numpy as np
 from ftfy import fix_text
-from PIL import Image, ImageDraw
+from PIL import Image
 
 from surya.detection import DetectionPredictor, InlineDetectionPredictor, TextDetectionResult
 from surya.ocr_error import OCRErrorPredictor
@@ -55,6 +55,10 @@ class LineBuilder(BaseBuilder):
         "The minimum coverage ratio required for the layout model to consider",
         "the lines from the PdfProvider valid.",
     ] = .25
+    min_document_ocr_threshold: Annotated[
+        float,
+        "If less pages than this threshold are good, OCR will happen in the document.  Otherwise it will not."
+    ] = 0.85
     span_inline_math_overlap_threshold: Annotated[
         float,
         "The minimum overlap of a span with an inline math box to consider for removal"
@@ -122,7 +126,6 @@ def get_ocr_error_batch_size(self):
         return 4
 
     def get_detection_results(self, page_images: List[Image.Image], run_detection: List[bool], do_inline_math_detection: bool):
-        page_images = [p for p, good in zip(page_images, run_detection) if good]
         page_detection_results = self.detection_model(
             images=page_images,
             batch_size=self.get_detection_batch_size()
@@ -148,6 +151,7 @@ def get_detection_results(self, page_images: List[Image.Image], run_detection: L
                 inline_results.append(None)
         assert idx == len(page_images)
 
+        assert len(run_detection) == len(detection_results) == len(inline_results)
         return detection_results, inline_results
 
 
@@ -163,14 +167,21 @@ def get_all_lines(self, document: Document, provider: PdfProvider, do_inline_mat
         for document_page, ocr_error_detection_label in zip(document.pages, ocr_error_detection_results.labels):
             provider_lines: List[ProviderOutput] = provider.page_lines.get(document_page.page_id, [])
             provider_lines_good = all([
-                bool(provider),
+                bool(provider_lines),
                 ocr_error_detection_label != 'bad',
                 self.check_layout_coverage(document_page, provider_lines)
             ])
             layout_good.append(provider_lines_good)
 
+        # Don't OCR if only a few pages are bad
+        if sum(layout_good) > len(document.pages) * self.min_document_ocr_threshold:
+            layout_good = [True] * len(document.pages)
+
         run_detection = [not good or do_inline_math_detection for good in layout_good]
         page_images = [page.get_image(highres=False, remove_blocks=self.ocr_remove_blocks) for page, good in zip(document.pages, run_detection) if good]
+
+        # Note: run_detection is longer than page_images, since it has a value for each page, not just good ones
+        # Detection results and inline detection results are for every page (we use run_detection to make the list full length)
         detection_results, inline_detection_results = self.get_detection_results(page_images, run_detection, do_inline_math_detection)
 
         assert len(detection_results) == len(inline_detection_results) == len(layout_good) == len(document.pages)

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "marker-pdf"
-version = "1.5.0"
+version = "1.5.1"
 description = "Convert PDF to markdown with high speed and accuracy."
 authors = ["Vik Paruchuri <github@vikas.sh>"]
 readme = "README.md"