From 9ff826d16bb3ea21bfcc1bf0a98fd97a7c76e1fc Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Wed, 12 Feb 2025 07:10:05 +0000 Subject: [PATCH 1/2] add heuristic for ignore line number spans --- marker/processors/line_numbers.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/marker/processors/line_numbers.py b/marker/processors/line_numbers.py index d38d9131..bdf105eb 100644 --- a/marker/processors/line_numbers.py +++ b/marker/processors/line_numbers.py @@ -24,14 +24,40 @@ class LineNumbersProcessor(BaseProcessor): "The minimum length of a line (in characters) to consider it significant when checking for", "numeric prefixes or suffixes. Prevents false positives for short lines.", ] = 10 + min_line_number_span_ratio: Annotated[ + float, + "The minimum ratio of detected line number spans to total lines required to treat them as line numbers.", + ] = .6 def __init__(self, config): super().__init__(config) def __call__(self, document: Document): + self.ignore_line_number_spans(document) self.ignore_line_starts_ends(document) self.ignore_line_number_blocks(document) + def ignore_line_number_spans(self, document: Document): + for page in document.pages: + line_count = 0 + line_number_spans = [] + for block in page.contained_blocks(document, (BlockTypes.Line,)): + if block.structure is None: + continue + + line_count += 1 + leftmost_span = None + for span in block.contained_blocks(document, (BlockTypes.Span,)): + if leftmost_span is None or span.polygon.x_start < leftmost_span.polygon.x_start: + leftmost_span = span + + if leftmost_span is not None and leftmost_span.text.strip().split(' ')[0].isnumeric(): + line_number_spans.append(leftmost_span) + + if line_count > 0 and len(line_number_spans) / line_count > self.min_line_number_span_ratio: + for span in line_number_spans: + span.ignore_for_output = True + def ignore_line_number_blocks(self, document: Document): for page in document.pages: for block in page.contained_blocks(document, self.block_types): From b71f199011acd5e896ab2b70bb290987d2e6fdf5 Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Wed, 12 Feb 2025 08:02:20 +0000 Subject: [PATCH 2/2] fix thinko [skip ci] --- marker/processors/line_numbers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/marker/processors/line_numbers.py b/marker/processors/line_numbers.py index bdf105eb..6a4ab86f 100644 --- a/marker/processors/line_numbers.py +++ b/marker/processors/line_numbers.py @@ -51,7 +51,7 @@ def ignore_line_number_spans(self, document: Document): if leftmost_span is None or span.polygon.x_start < leftmost_span.polygon.x_start: leftmost_span = span - if leftmost_span is not None and leftmost_span.text.strip().split(' ')[0].isnumeric(): + if leftmost_span is not None and leftmost_span.text.strip().isnumeric(): line_number_spans.append(leftmost_span) if line_count > 0 and len(line_number_spans) / line_count > self.min_line_number_span_ratio: