From ebe1024390c1671113590fc1d4796cb7242aea72 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Wed, 12 Feb 2025 15:20:29 -0500 Subject: [PATCH] Filter equation overlaps --- marker/builders/line.py | 7 +++++++ marker/processors/llm/llm_text.py | 4 ---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/marker/builders/line.py b/marker/builders/line.py index 284626ff..44b7321f 100644 --- a/marker/builders/line.py +++ b/marker/builders/line.py @@ -155,6 +155,13 @@ def get_all_lines(self, document: Document, provider: PdfProvider, do_inline_mat image_size, page_size ) + detection_result = self.filter_equation_overlaps( + document, + document_page, + detection_result, + image_size, + page_size + ) # Merge text and inline math detection results merged_detection_boxes = self.determine_math_lines(text_result=detection_result, inline_result=inline_detection_result) diff --git a/marker/processors/llm/llm_text.py b/marker/processors/llm/llm_text.py index 3e6d96f6..ec26cdcb 100644 --- a/marker/processors/llm/llm_text.py +++ b/marker/processors/llm/llm_text.py @@ -5,13 +5,11 @@ from PIL import Image from marker.processors.llm import BaseLLMSimpleBlockProcessor, PromptData, BlockData -from bs4 import BeautifulSoup from marker.processors.util import add_math_spans_to_line from marker.schema import BlockTypes from marker.schema.blocks import Block from marker.schema.document import Document -from marker.schema.registry import get_block_class from marker.schema.text import Line @@ -95,8 +93,6 @@ def inference_blocks(self, document: Document) -> List[List[BlockData]]: out_blocks.append(batch) return out_blocks - - def get_block_lines(self, block: Block, document: Document) -> Tuple[list, list]: text_lines = block.contained_blocks(document, (BlockTypes.Line,)) extracted_lines = [line.formatted_text(document) for line in text_lines]