Skip to content

Commit

Permalink
Merge pull request #545 from VikParuchuri/dev
Browse files Browse the repository at this point in the history
Adjust OCR thresh
  • Loading branch information
VikParuchuri authored Feb 14, 2025
2 parents b87d088 + cdd482b commit 9721c17
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 4 deletions.
17 changes: 14 additions & 3 deletions marker/builders/line.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import numpy as np
from ftfy import fix_text
from PIL import Image, ImageDraw
from PIL import Image

from surya.detection import DetectionPredictor, InlineDetectionPredictor, TextDetectionResult
from surya.ocr_error import OCRErrorPredictor
Expand Down Expand Up @@ -55,6 +55,10 @@ class LineBuilder(BaseBuilder):
"The minimum coverage ratio required for the layout model to consider",
"the lines from the PdfProvider valid.",
] = .25
min_document_ocr_threshold: Annotated[
float,
"If less pages than this threshold are good, OCR will happen in the document. Otherwise it will not."
] = 0.85
span_inline_math_overlap_threshold: Annotated[
float,
"The minimum overlap of a span with an inline math box to consider for removal"
Expand Down Expand Up @@ -122,7 +126,6 @@ def get_ocr_error_batch_size(self):
return 4

def get_detection_results(self, page_images: List[Image.Image], run_detection: List[bool], do_inline_math_detection: bool):
page_images = [p for p, good in zip(page_images, run_detection) if good]
page_detection_results = self.detection_model(
images=page_images,
batch_size=self.get_detection_batch_size()
Expand All @@ -148,6 +151,7 @@ def get_detection_results(self, page_images: List[Image.Image], run_detection: L
inline_results.append(None)
assert idx == len(page_images)

assert len(run_detection) == len(detection_results) == len(inline_results)
return detection_results, inline_results


Expand All @@ -163,14 +167,21 @@ def get_all_lines(self, document: Document, provider: PdfProvider, do_inline_mat
for document_page, ocr_error_detection_label in zip(document.pages, ocr_error_detection_results.labels):
provider_lines: List[ProviderOutput] = provider.page_lines.get(document_page.page_id, [])
provider_lines_good = all([
bool(provider),
bool(provider_lines),
ocr_error_detection_label != 'bad',
self.check_layout_coverage(document_page, provider_lines)
])
layout_good.append(provider_lines_good)

# Don't OCR if only a few pages are bad
if sum(layout_good) > len(document.pages) * self.min_document_ocr_threshold:
layout_good = [True] * len(document.pages)

run_detection = [not good or do_inline_math_detection for good in layout_good]
page_images = [page.get_image(highres=False, remove_blocks=self.ocr_remove_blocks) for page, good in zip(document.pages, run_detection) if good]

# Note: run_detection is longer than page_images, since it has a value for each page, not just good ones
# Detection results and inline detection results are for every page (we use run_detection to make the list full length)
detection_results, inline_detection_results = self.get_detection_results(page_images, run_detection, do_inline_math_detection)

assert len(detection_results) == len(inline_detection_results) == len(layout_good) == len(document.pages)
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "marker-pdf"
version = "1.5.0"
version = "1.5.1"
description = "Convert PDF to markdown with high speed and accuracy."
authors = ["Vik Paruchuri <github@vikas.sh>"]
readme = "README.md"
Expand Down

0 comments on commit 9721c17

Please sign in to comment.