Skip to content

Commit

Permalink
feat: Pass predicted page-headers and page-footers through to DoclingDocument furniture
Browse files (browse the repository at this point in the history)

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
  • Loading branch information
cau-git committed Jan 13, 2025
1 parent 1976584 commit 5c681ba
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 2 deletions.
62 changes: 60 additions & 2 deletions docling/models/ds_glm_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,12 @@
from typing import List, Union

from deepsearch_glm.andromeda_nlp import nlp_model
from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
from docling_core.types.doc import (
BoundingBox,
CoordOrigin,
DocItemLabel,
DoclingDocument,
)
from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
from docling_core.types.legacy_doc.base import (
Figure,
Expand Down Expand Up @@ -71,12 +76,15 @@ def _to_legacy_document(self, conv_res) -> DsDocument:
)

main_text: List[Union[Ref, BaseText]] = []
page_headers: List[Union[Ref, BaseText]] = []
page_footers: List[Union[Ref, BaseText]] = []

tables: List[DsSchemaTable] = []
figures: List[Figure] = []

page_no_to_page = {p.page_no: p for p in conv_res.pages}

for element in conv_res.assembled.elements:
for element in conv_res.assembled.body:
# Convert bboxes to lower-left origin.
target_bbox = DsBoundingBox(
element.cluster.bbox.to_bottom_left_origin(
Expand Down Expand Up @@ -238,6 +246,53 @@ def make_spans(cell):
)
)

# We can throw in headers and footers at the end of the legacy doc
# since the reading-order will re-sort it later.
for element in conv_res.assembled.headers:
# Convert bboxes to lower-left origin.
target_bbox = DsBoundingBox(
element.cluster.bbox.to_bottom_left_origin(
page_no_to_page[element.page_no].size.height
).as_tuple()
)

if isinstance(element, TextElement):

tel = BaseText(
text=element.text,
obj_type=layout_label_to_ds_type.get(element.label),
name=element.label,
prov=[
Prov(
bbox=target_bbox,
page=element.page_no + 1,
span=[0, len(element.text)],
)
],
)
if element.label == DocItemLabel.PAGE_HEADER:
index = len(page_headers)
ref_str = f"#/page-headers/{index}"
main_text.append(
Ref(
name=element.label,
obj_type=layout_label_to_ds_type.get(element.label),
ref=ref_str,
),
)
page_headers.append(tel)
elif element.label == DocItemLabel.PAGE_FOOTER:
index = len(page_footers)
ref_str = f"#/page-footers/{index}"
main_text.append(
Ref(
name=element.label,
obj_type=layout_label_to_ds_type.get(element.label),
ref=ref_str,
),
)
page_footers.append(tel)

page_dimensions = [
PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
for p in conv_res.pages
Expand All @@ -252,6 +307,8 @@ def make_spans(cell):
tables=tables,
figures=figures,
page_dimensions=page_dimensions,
page_headers=page_headers,
page_footers=page_footers,
)

return ds_doc
Expand All @@ -264,6 +321,7 @@ def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
glm_doc = self.model.apply_on_doc(ds_doc_dict)

docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
1 == 1

# DEBUG code:
def draw_clusters_and_cells(ds_document, page_no, show: bool = False):
Expand Down
9 changes: 9 additions & 0 deletions docling/utils/glm_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,15 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
current_list = None

doc.add_heading(text=text, prov=prov)
elif label in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]:
current_list = None

doc.add_text(
label=DocItemLabel(name_label),
text=text,
prov=prov,
parent=doc.furniture,
)
else:
current_list = None

Expand Down

0 comments on commit 5c681ba

Please sign in to comment.