Skip to content

Commit

Permalink
Enable using texify for inline math
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Feb 12, 2025
1 parent c03ff98 commit 1054a4a
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 71 deletions.
8 changes: 1 addition & 7 deletions marker/builders/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,6 @@ class OcrBuilder(BaseBuilder):
bool,
"Whether to skip OCR on tables. The TableProcessor will re-OCR them. Only enable if the TableProcessor is not running.",
] = False
block_ocr_threshold: Annotated[
float,
"The minimum fraction of detected lines in a block to OCR the block"
] = 0.05
languages: Annotated[
Optional[List[str]],
"A list of languages to use for OCR.",
Expand Down Expand Up @@ -66,9 +62,7 @@ def get_ocr_images_boxes_ids(self, document: Document, provider: PdfProvider):
image_size = page_highres_image.size
for block in document_page.contained_blocks(document):
block_lines = block.contained_blocks(document, [BlockTypes.Line])
block_detected_lines = [block_line for block_line in block_lines if block_line.text_extraction_method=='surya']
if len(block_lines)==0 or len(block_detected_lines)/len(block_lines)<self.block_ocr_threshold:
continue
block_detected_lines = [block_line for block_line in block_lines if block_line.text_extraction_method == 'surya']

block.text_extraction_method = 'surya'
for line in block_detected_lines:
Expand Down
28 changes: 20 additions & 8 deletions marker/processors/equation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@

from marker.models import TexifyPredictor
from marker.processors import BaseProcessor
from marker.processors.util import add_math_spans_to_line
from marker.schema import BlockTypes
from marker.schema.blocks import Equation
from marker.schema.document import Document
from marker.settings import settings

Expand Down Expand Up @@ -32,11 +34,10 @@ class EquationProcessor(BaseProcessor):
bool,
"Whether to disable the tqdm progress bar.",
] = False
inline_math_token_threshold: Annotated[
int,
"The minimum number of texify-generated tokens to replace inline math with the latex representation",
"Prevents replacing single chars and unecessary equations"
] = 20
texify_inline_spans: Annotated[
bool,
"Whether to run texify on inline math spans."
] = False

def __init__(self, texify_model: TexifyPredictor, config=None):
super().__init__(config)
Expand All @@ -47,15 +48,22 @@ def __call__(self, document: Document):
equation_data = []

for page in document.pages:
for block in page.contained_blocks(document, self.block_types):
equation_blocks = page.contained_blocks(document, self.block_types)
math_blocks = []
if self.texify_inline_spans:
math_blocks = page.contained_blocks(document, (BlockTypes.Line,))
math_blocks = [m for m in math_blocks if m.formats and "math" in m.formats]

for block in equation_blocks + math_blocks:
image = block.get_image(document, highres=False).convert("RGB")
raw_text = block.raw_text(document)
token_count = self.get_total_texify_tokens(raw_text)

equation_data.append({
"image": image,
"block_id": block.id,
"token_count": token_count
"token_count": token_count,
"page": page
})

if len(equation_data) == 0:
Expand All @@ -71,7 +79,11 @@ def __call__(self, document: Document):
continue

block = document.get_block(equation_d["block_id"])
block.html = prediction
if isinstance(block, Equation):
block.html = prediction
else:
block.structure = []
add_math_spans_to_line(prediction, block, equation_d["page"])

def get_batch_size(self):
if self.texify_batch_size is not None:
Expand Down
59 changes: 3 additions & 56 deletions marker/processors/llm/llm_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

from marker.processors.llm import BaseLLMSimpleBlockProcessor, PromptData, BlockData
from bs4 import BeautifulSoup

from marker.processors.util import add_math_spans_to_line
from marker.schema import BlockTypes
from marker.schema.blocks import Block
from marker.schema.document import Document
Expand Down Expand Up @@ -141,8 +143,6 @@ def rewrite_block(self, response: dict, prompt_data: PromptData, document: Docum
blocks = prompt_data["additional_data"]["blocks"]
pages = prompt_data["additional_data"]["pages"]

SpanClass = get_block_class(BlockTypes.Span)

if not response or "corrected_lines" not in response:
blocks[0].update_metadata(llm_error_count=1)
return
Expand All @@ -154,60 +154,7 @@ def rewrite_block(self, response: dict, prompt_data: PromptData, document: Docum

for text_line, page, corrected_text in zip(blocks, pages, corrected_lines):
text_line.structure = []
corrected_spans = self.text_to_spans(corrected_text)

for span_idx, span in enumerate(corrected_spans):
if span_idx == len(corrected_spans) - 1:
span['content'] += "\n"

span_block = page.add_full_block(
SpanClass(
polygon=text_line.polygon,
text=span['content'],
font='Unknown',
font_weight=0,
font_size=0,
minimum_position=0,
maximum_position=0,
formats=[span['type']],
url=span.get('url'),
page_id=text_line.page_id,
text_extraction_method="gemini",
)
)
text_line.structure.append(span_block.id)

@staticmethod
def text_to_spans(text):
soup = BeautifulSoup(text, 'html.parser')

tag_types = {
'b': 'bold',
'i': 'italic',
'math': 'math',
}
spans = []

for element in soup.descendants:
if not len(list(element.parents)) == 1:
continue

url = element.attrs.get('href') if hasattr(element, 'attrs') else None

if element.name in tag_types:
spans.append({
'type': tag_types[element.name],
'content': element.get_text(),
'url': url
})
elif element.string:
spans.append({
'type': 'plain',
'content': element.string,
'url': url
})

return spans
add_math_spans_to_line(corrected_text, text_line, page)

class LLMTextSchema(BaseModel):
corrected_lines: List[str]
64 changes: 64 additions & 0 deletions marker/processors/util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
from bs4 import BeautifulSoup

from marker.schema import BlockTypes
from marker.schema.groups import PageGroup
from marker.schema.registry import get_block_class
from marker.schema.text import Line


def add_math_spans_to_line(corrected_text: str, text_line: Line, page: PageGroup):
    """Append one Span block per parsed segment of ``corrected_text`` to ``text_line``.

    ``corrected_text`` is split with ``text_to_spans``; every resulting segment
    is registered on ``page`` through ``page.add_full_block`` and the id of the
    created block is appended to ``text_line.structure``. Existing entries in
    ``text_line.structure`` are left in place, so callers that want a
    replacement must clear the list first.

    Args:
        corrected_text: Marked-up text to split into spans.
        text_line: Line whose polygon and page_id are copied onto each span and
            whose ``structure`` list receives the new span ids.
        page: Page group on which the new span blocks are registered.
    """
    span_cls = get_block_class(BlockTypes.Span)
    parsed = text_to_spans(corrected_text)
    final_idx = len(parsed) - 1

    for idx, piece in enumerate(parsed):
        # The last span of the line carries the trailing newline.
        if idx == final_idx:
            piece['content'] += "\n"

        # Font and position fields are filled with placeholder values
        # ('Unknown' / 0); only text, formats and url come from the parse.
        span_kwargs = dict(
            polygon=text_line.polygon,
            text=piece['content'],
            font='Unknown',
            font_weight=0,
            font_size=0,
            minimum_position=0,
            maximum_position=0,
            formats=[piece['type']],
            url=piece.get('url'),
            page_id=text_line.page_id,
            text_extraction_method="gemini",
        )
        new_span = page.add_full_block(span_cls(**span_kwargs))
        text_line.structure.append(new_span.id)


def text_to_spans(text):
    """Split lightly marked-up text into a flat list of span descriptions.

    The text is parsed with BeautifulSoup's ``html.parser`` and only the
    top-level nodes of the parse tree are turned into spans:

    * ``<b>``, ``<i>`` and ``<math>`` tags become spans of type ``'bold'``,
      ``'italic'`` and ``'math'``; their content is the tag's full text.
    * Every other node (bare text, or any other tag) becomes a ``'plain'``
      span, provided it has non-empty text.

    Args:
        text: The marked-up string to split.

    Returns:
        A list of dicts with the keys ``'type'``, ``'content'`` and ``'url'``.
        ``'url'`` is the node's ``href`` attribute, or ``None`` when the node
        has no attributes or no ``href``.
    """
    soup = BeautifulSoup(text, 'html.parser')

    tag_types = {
        'b': 'bold',
        'i': 'italic',
        'math': 'math',
    }
    spans = []

    # Direct children of the soup are exactly the nodes with a single parent,
    # so there is no need to walk every descendant and count its ancestors.
    for element in soup.children:
        url = element.attrs.get('href') if hasattr(element, 'attrs') else None

        if element.name in tag_types:
            span_type = tag_types[element.name]
            content = element.get_text()
        else:
            span_type = 'plain'
            content = element.string
            # `.string` is None for a tag with more than one child (for
            # example `<a href="...">see <b>this</b></a>`). Fall back to the
            # tag's combined text so that content is not silently dropped.
            if content is None and element.name is not None:
                content = element.get_text()
            if not content:
                continue

        spans.append({
            'type': span_type,
            'content': content,
            'url': url
        })

    return spans

0 comments on commit 1054a4a

Please sign in to comment.