Enable using texify for inline math

VikParuchuri · Feb 12, 2025 · 1054a4a · 1054a4a
1 parent c03ff98
commit 1054a4a
Show file tree

Hide file tree

Showing 4 changed files with 88 additions and 71 deletions.
diff --git a/marker/builders/ocr.py b/marker/builders/ocr.py
@@ -27,10 +27,6 @@ class OcrBuilder(BaseBuilder):
         bool,
         "Whether to skip OCR on tables.  The TableProcessor will re-OCR them.  Only enable if the TableProcessor is not running.",
     ] = False
-    block_ocr_threshold: Annotated[
-        float,
-        "The minimum fraction of detected lines in a block to OCR the block"
-    ] = 0.05
     languages: Annotated[
         Optional[List[str]],
         "A list of languages to use for OCR.",
@@ -66,9 +62,7 @@ def get_ocr_images_boxes_ids(self, document: Document, provider: PdfProvider):
             image_size = page_highres_image.size
             for block in document_page.contained_blocks(document):
                 block_lines = block.contained_blocks(document, [BlockTypes.Line])
-                block_detected_lines = [block_line for block_line in block_lines if block_line.text_extraction_method=='surya']
-                if len(block_lines)==0 or len(block_detected_lines)/len(block_lines)<self.block_ocr_threshold:
-                    continue
+                block_detected_lines = [block_line for block_line in block_lines if block_line.text_extraction_method == 'surya']
 
                 block.text_extraction_method = 'surya'
                 for line in block_detected_lines:

diff --git a/marker/processors/equation.py b/marker/processors/equation.py
@@ -2,7 +2,9 @@
 
 from marker.models import TexifyPredictor
 from marker.processors import BaseProcessor
+from marker.processors.util import add_math_spans_to_line
 from marker.schema import BlockTypes
+from marker.schema.blocks import Equation
 from marker.schema.document import Document
 from marker.settings import settings
 
@@ -32,11 +34,10 @@ class EquationProcessor(BaseProcessor):
         bool,
         "Whether to disable the tqdm progress bar.",
     ] = False
-    inline_math_token_threshold: Annotated[
-        int,
-        "The minimum number of texify-generated tokens to replace inline math with the latex representation",
-        "Prevents replacing single chars and unecessary equations"
-    ] = 20
+    texify_inline_spans: Annotated[
+        bool,
+        "Whether to run texify on inline math spans."
+    ] = False
 
     def __init__(self, texify_model: TexifyPredictor, config=None):
         super().__init__(config)
@@ -47,15 +48,22 @@ def __call__(self, document: Document):
         equation_data = []
 
         for page in document.pages:
-            for block in page.contained_blocks(document, self.block_types):
+            equation_blocks = page.contained_blocks(document, self.block_types)
+            math_blocks = []
+            if self.texify_inline_spans:
+                math_blocks = page.contained_blocks(document, (BlockTypes.Line,))
+                math_blocks = [m for m in math_blocks if m.formats and "math" in m.formats]
+
+            for block in equation_blocks + math_blocks:
                 image = block.get_image(document, highres=False).convert("RGB")
                 raw_text = block.raw_text(document)
                 token_count = self.get_total_texify_tokens(raw_text)
 
                 equation_data.append({
                     "image": image,
                     "block_id": block.id,
-                    "token_count": token_count
+                    "token_count": token_count,
+                    "page": page
                 })
 
         if len(equation_data) == 0:
@@ -71,7 +79,11 @@ def __call__(self, document: Document):
                 continue
 
             block = document.get_block(equation_d["block_id"])
-            block.html = prediction
+            if isinstance(block, Equation):
+                block.html = prediction
+            else:
+                block.structure = []
+                add_math_spans_to_line(prediction, block, equation_d["page"])
 
     def get_batch_size(self):
         if self.texify_batch_size is not None:

diff --git a/marker/processors/llm/llm_text.py b/marker/processors/llm/llm_text.py
@@ -6,6 +6,8 @@
 
 from marker.processors.llm import BaseLLMSimpleBlockProcessor, PromptData, BlockData
 from bs4 import BeautifulSoup
+
+from marker.processors.util import add_math_spans_to_line
 from marker.schema import BlockTypes
 from marker.schema.blocks import Block
 from marker.schema.document import Document
@@ -141,8 +143,6 @@ def rewrite_block(self, response: dict, prompt_data: PromptData, document: Docum
         blocks = prompt_data["additional_data"]["blocks"]
         pages = prompt_data["additional_data"]["pages"]
 
-        SpanClass = get_block_class(BlockTypes.Span)
-
         if not response or "corrected_lines" not in response:
             blocks[0].update_metadata(llm_error_count=1)
             return
@@ -154,60 +154,7 @@ def rewrite_block(self, response: dict, prompt_data: PromptData, document: Docum
 
         for text_line, page, corrected_text in zip(blocks, pages, corrected_lines):
             text_line.structure = []
-            corrected_spans = self.text_to_spans(corrected_text)
-
-            for span_idx, span in enumerate(corrected_spans):
-                if span_idx == len(corrected_spans) - 1:
-                    span['content'] += "\n"
-
-                span_block = page.add_full_block(
-                    SpanClass(
-                        polygon=text_line.polygon,
-                        text=span['content'],
-                        font='Unknown',
-                        font_weight=0,
-                        font_size=0,
-                        minimum_position=0,
-                        maximum_position=0,
-                        formats=[span['type']],
-                        url=span.get('url'),
-                        page_id=text_line.page_id,
-                        text_extraction_method="gemini",
-                    )
-                )
-                text_line.structure.append(span_block.id)
-
-    @staticmethod
-    def text_to_spans(text):
-        soup = BeautifulSoup(text, 'html.parser')
-
-        tag_types = {
-            'b': 'bold',
-            'i': 'italic',
-            'math': 'math',
-        }
-        spans = []
-
-        for element in soup.descendants:
-            if not len(list(element.parents)) == 1:
-                continue
-
-            url = element.attrs.get('href') if hasattr(element, 'attrs') else None
-
-            if element.name in tag_types:
-                spans.append({
-                    'type': tag_types[element.name],
-                    'content': element.get_text(),
-                    'url': url
-                })
-            elif element.string:
-                spans.append({
-                    'type': 'plain',
-                    'content': element.string,
-                    'url': url
-                })
-
-        return spans
+            add_math_spans_to_line(corrected_text, text_line, page)
 
 class LLMTextSchema(BaseModel):
     corrected_lines: List[str]
diff --git a/marker/processors/util.py b/marker/processors/util.py
@@ -0,0 +1,64 @@
+from bs4 import BeautifulSoup
+
+from marker.schema import BlockTypes
+from marker.schema.groups import PageGroup
+from marker.schema.registry import get_block_class
+from marker.schema.text import Line
+
+
+def add_math_spans_to_line(corrected_text: str, text_line: Line, page: PageGroup):
+    """Parse ``corrected_text`` into spans and append them to ``text_line``.
+
+    Each parsed span is registered on ``page`` as a new Span block that reuses
+    the line's polygon and page id, and the new block's id is appended to
+    ``text_line.structure``.  The last span's text gets a trailing newline.
+    Existing entries in ``text_line.structure`` are left in place.
+    """
+    span_cls = get_block_class(BlockTypes.Span)
+    parsed = text_to_spans(corrected_text)
+    # Constant placeholder values shared by every span created here.
+    placeholders = dict(font='Unknown', font_weight=0, font_size=0,
+                        minimum_position=0, maximum_position=0)
+
+    if parsed:
+        parsed[-1]['content'] += "\n"
+
+    for piece in parsed:
+        new_span = span_cls(
+            polygon=text_line.polygon, page_id=text_line.page_id,
+            text=piece['content'], formats=[piece['type']],
+            url=piece.get('url'), text_extraction_method="gemini", **placeholders,
+        )
+        text_line.structure.append(page.add_full_block(new_span).id)
+
+
+def text_to_spans(text):
+    """Split lightly marked-up text into a flat list of span dicts.
+
+    Only direct children of the parsed document are emitted.  ``<b>``, ``<i>``
+    and ``<math>`` tags map to 'bold', 'italic' and 'math'; any other node with
+    non-empty text becomes 'plain'.  Every dict carries 'type', 'content' and
+    'url' (the node's ``href`` attribute, or None when absent).
+    """
+    soup = BeautifulSoup(text, 'html.parser')
+    tag_types = {'b': 'bold', 'i': 'italic', 'math': 'math'}
+    spans = []
+
+    # soup.children is exactly the set of nodes whose only parent is the soup.
+    for element in soup.children:
+        is_tag = hasattr(element, 'attrs')
+        url = element.attrs.get('href') if is_tag else None
+
+        if element.name in tag_types:
+            span_type, content = tag_types[element.name], element.get_text()
+        else:
+            # Tag.string is None for tags with several children, which used to
+            # drop their text silently; get_text() keeps it.
+            span_type = 'plain'
+            content = element.get_text() if is_tag else element.string
+            if not content:
+                continue
+
+        spans.append({'type': span_type, 'content': content, 'url': url})
+
+    return spans