Skip to content

Commit

Permalink
Fix converter for multi-page response
Browse files: browse the repository at this point in the history
  • Loading branch information
Belval committed Nov 13, 2024
1 parent 0db2f94 commit b667dc0
Showing 1 changed file with 12 additions and 8 deletions.
20 changes: 12 additions & 8 deletions textractor/utils/legacy_utils.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -14,11 +14,11 @@

def converter(response):
blocks_to_delete = []
page_block = None
page_blocks = []
try:
for i, block in enumerate(response["Blocks"]):
if block.get("BlockType") == "PAGE":
page_block = block
page_blocks.append(block)
elif block.get("BlockType", "").startswith("LAYOUT_FIGURE_"):
block["BlockType"] = LAYOUT_TEXT
elif (
Expand All @@ -40,15 +40,19 @@ def converter(response):
elif block.get("BlockType") == LAYOUT_FIGURE and "CONTAINER" in block.get("EntityTypes", []):
blocks_to_delete.append((i, block))

page_relationships = []
for relationship in page_block.get("Relationships", []):
if relationship["Type"] == "CHILD":
page_relationships = relationship["Ids"]
break
blocks_to_delete_id_set = set([b["Id"] for _, b in blocks_to_delete])
for page_block in page_blocks:
for relationship in page_block.get("Relationships", []):
if relationship["Type"] == "CHILD":
relationship["Ids"] = [
id
for id in relationship["Ids"]
if id not in blocks_to_delete_id_set
]
break

for i, block in blocks_to_delete[::-1]:
del response["Blocks"][i]
page_relationships.remove(block["Id"])
except Exception as ex:
logging.warning(f"Failed to convert the response for backward compatibility. {str(ex)}")

Expand Down

0 comments on commit b667dc0

Please sign in to comment.