From 5d28d39cba35773a0e31dd9d7282cfc40c3cfa9f Mon Sep 17 00:00:00 2001
From: Vik Paruchuri <vik.paruchuri@gmail.com>
Date: Thu, 13 Feb 2025 16:41:06 -0500
Subject: [PATCH 1/3] Adjust OCR thresh

---
 marker/builders/line.py    | 12 ++++++++++--
 marker/processors/table.py |  2 ++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/marker/builders/line.py b/marker/builders/line.py
index 4391c7e8..91b3ef87 100644
--- a/marker/builders/line.py
+++ b/marker/builders/line.py
@@ -3,7 +3,7 @@
 
 import numpy as np
 from ftfy import fix_text
-from PIL import Image, ImageDraw
+from PIL import Image
 
 from surya.detection import DetectionPredictor, InlineDetectionPredictor, TextDetectionResult
 from surya.ocr_error import OCRErrorPredictor
@@ -55,6 +55,10 @@ class LineBuilder(BaseBuilder):
         "The minimum coverage ratio required for the layout model to consider",
         "the lines from the PdfProvider valid.",
     ] = .25
+    min_document_ocr_threshold: Annotated[
+        float,
+        "If less pages than this threshold are good, OCR will happen in the document.  Otherwise it will not."
+    ] = 0.85
     span_inline_math_overlap_threshold: Annotated[
         float,
         "The minimum overlap of a span with an inline math box to consider for removal"
@@ -163,12 +167,16 @@ def get_all_lines(self, document: Document, provider: PdfProvider, do_inline_mat
         for document_page, ocr_error_detection_label in zip(document.pages, ocr_error_detection_results.labels):
             provider_lines: List[ProviderOutput] = provider.page_lines.get(document_page.page_id, [])
             provider_lines_good = all([
-                bool(provider),
+                bool(provider_lines),
                 ocr_error_detection_label != 'bad',
                 self.check_layout_coverage(document_page, provider_lines)
             ])
             layout_good.append(provider_lines_good)
 
+        # Don't OCR if only a few pages are bad
+        if sum(layout_good) > len(document.pages) * self.min_document_ocr_threshold:
+            layout_good = [True] * len(document.pages)
+
         run_detection = [not good or do_inline_math_detection for good in layout_good]
         page_images = [page.get_image(highres=False, remove_blocks=self.ocr_remove_blocks) for page, good in zip(document.pages, run_detection) if good]
         detection_results, inline_detection_results = self.get_detection_results(page_images, run_detection, do_inline_math_detection)
diff --git a/marker/processors/table.py b/marker/processors/table.py
index 8bd2831e..84a76b92 100644
--- a/marker/processors/table.py
+++ b/marker/processors/table.py
@@ -12,6 +12,8 @@
 from pdftext.extraction import table_output
 
 from marker.processors import BaseProcessor
+from marker.providers.pdf import PdfProvider
+from marker.providers.registry import provider_from_filepath
 from marker.schema import BlockTypes
 from marker.schema.blocks.tablecell import TableCell
 from marker.schema.document import Document

From 78a14bbaa1cfae5c7ca7fdc849ce2295a0899dfc Mon Sep 17 00:00:00 2001
From: Vik Paruchuri <vik.paruchuri@gmail.com>
Date: Thu, 13 Feb 2025 16:53:55 -0500
Subject: [PATCH 2/3] OCR irregular docs

---
 marker/builders/line.py    | 6 +++++-
 marker/processors/table.py | 2 --
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/marker/builders/line.py b/marker/builders/line.py
index 91b3ef87..bc38e4bd 100644
--- a/marker/builders/line.py
+++ b/marker/builders/line.py
@@ -126,7 +126,6 @@ def get_ocr_error_batch_size(self):
         return 4
 
     def get_detection_results(self, page_images: List[Image.Image], run_detection: List[bool], do_inline_math_detection: bool):
-        page_images = [p for p, good in zip(page_images, run_detection) if good]
         page_detection_results = self.detection_model(
             images=page_images,
             batch_size=self.get_detection_batch_size()
@@ -152,10 +151,12 @@ def get_detection_results(self, page_images: List[Image.Image], run_detection: L
                 inline_results.append(None)
         assert idx == len(page_images)
 
+        assert len(run_detection) == len(detection_results) == len(inline_results)
         return detection_results, inline_results
 
 
     def get_all_lines(self, document: Document, provider: PdfProvider, do_inline_math_detection: bool):
+        assert len(document.pages) == len(provider.page_lines)
         ocr_error_detection_results = self.ocr_error_detection(document.pages, provider.page_lines)
 
         boxes_to_ocr = {page.page_id: [] for page in document.pages}
@@ -179,6 +180,9 @@ def get_all_lines(self, document: Document, provider: PdfProvider, do_inline_mat
 
         run_detection = [not good or do_inline_math_detection for good in layout_good]
         page_images = [page.get_image(highres=False, remove_blocks=self.ocr_remove_blocks) for page, good in zip(document.pages, run_detection) if good]
+
+        # Note: run_detection has one entry per page, while page_images only holds the pages where detection will run, so it can be shorter
+        # Detection results and inline detection results are for every page (we use run_detection to make the list full length)
         detection_results, inline_detection_results = self.get_detection_results(page_images, run_detection, do_inline_math_detection)
 
         assert len(detection_results) == len(inline_detection_results) == len(layout_good) == len(document.pages)
diff --git a/marker/processors/table.py b/marker/processors/table.py
index 84a76b92..8bd2831e 100644
--- a/marker/processors/table.py
+++ b/marker/processors/table.py
@@ -12,8 +12,6 @@
 from pdftext.extraction import table_output
 
 from marker.processors import BaseProcessor
-from marker.providers.pdf import PdfProvider
-from marker.providers.registry import provider_from_filepath
 from marker.schema import BlockTypes
 from marker.schema.blocks.tablecell import TableCell
 from marker.schema.document import Document

From cdd482b79e4db8ead5cd8ac2da4bd64d385282e8 Mon Sep 17 00:00:00 2001
From: Vik Paruchuri <vik.paruchuri@gmail.com>
Date: Thu, 13 Feb 2025 20:03:37 -0500
Subject: [PATCH 3/3] Bump version

---
 marker/builders/line.py | 1 -
 pyproject.toml          | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/marker/builders/line.py b/marker/builders/line.py
index bc38e4bd..dd39ad39 100644
--- a/marker/builders/line.py
+++ b/marker/builders/line.py
@@ -156,7 +156,6 @@ def get_detection_results(self, page_images: List[Image.Image], run_detection: L
 
 
     def get_all_lines(self, document: Document, provider: PdfProvider, do_inline_math_detection: bool):
-        assert len(document.pages) == len(provider.page_lines)
         ocr_error_detection_results = self.ocr_error_detection(document.pages, provider.page_lines)
 
         boxes_to_ocr = {page.page_id: [] for page in document.pages}
diff --git a/pyproject.toml b/pyproject.toml
index b3a17f5c..9ab9583c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "marker-pdf"
-version = "1.5.0"
+version = "1.5.1"
 description = "Convert PDF to markdown with high speed and accuracy."
 authors = ["Vik Paruchuri <github@vikas.sh>"]
 readme = "README.md"