
Commit

feature/refactored structure_documents_by_group_and_indices (#1210)
gecBurton authored Nov 21, 2024
1 parent b50a89c commit 270cf1b
Showing 1 changed file with 38 additions and 25 deletions.
63 changes: 38 additions & 25 deletions redbox-core/redbox/transform.py
@@ -7,7 +7,7 @@
 from langchain_core.messages import BaseMessage, AIMessage
 from langchain_core.runnables import RunnableLambda
 
-from redbox.models.chain import DocumentGroup, DocumentState, LLMCallMetadata, RedboxState, RequestMetadata
+from redbox.models.chain import DocumentState, LLMCallMetadata, RedboxState, RequestMetadata
 from redbox.models.graph import RedboxEventType
 

@@ -75,6 +75,27 @@ def create_group_uuid(file_name: str, indices: list[int]) -> UUID:
     return uuid5(NAMESPACE_DNS, unique_str)


+def create_group_uuid_for_group(documents: list[Document]) -> UUID:
+    """create a uuid for a DocumentGroup"""
+    if not documents:
+        raise ValueError("at least one document is required")
+
+    file_name = documents[0].metadata["uri"]
+    group_indices = [d.metadata["index"] for d in documents]
+    return create_group_uuid(file_name, group_indices)
+
+
+def documents_are_consecutive(first: Document, second: Document) -> bool:
+    """are the two documents consecutive, i.e. do they appear next to each other in the original text?"""
+    if first.metadata["uri"] is None:
+        return True
+
+    if first.metadata["uri"] != second.metadata["uri"]:
+        return False
+
+    return first.metadata["index"] + 1 == second.metadata["index"]
+
+
 def structure_documents_by_group_and_indices(docs: list[Document]) -> DocumentState:
     """Structures a list of documents by blocks of consecutive indices in group_uuids.
@@ -86,32 +107,24 @@ def structure_documents_by_group_and_indices(docs: list[Document]) -> DocumentState:
     The document_uuid is taken from the Document metadata directly.
     """
     result = DocumentState()
-    current_group = DocumentGroup()
-    current_group_indices: list[int] = []
-    current_filename: str | None = None
-
-    for d in docs:
-        is_not_same_filename = d.metadata["uri"] != current_filename
-        is_not_none_filename = current_filename is not None
-        is_not_consecutive = d.metadata["index"] - 1 != (
-            current_group_indices[-1] if current_group_indices else d.metadata["index"]
-        )
-        if (is_not_same_filename and is_not_none_filename) or (is_not_consecutive and is_not_none_filename):
-            # Generate a deterministic hash for the previous document and its indices
-            group_id = create_group_uuid(current_filename, current_group_indices)
-            result.groups[group_id] = current_group
-
-            current_group = DocumentGroup()
-            current_group_indices: list[int] = []
-
-        current_group[d.metadata["uuid"]] = d
-        current_group_indices.append(d.metadata["index"])
-        current_filename = d.metadata["uri"]
+    if not docs:
+        return result
+
+    groups = []
+    current_group = [docs[0]]
+    for last_doc, this_doc in zip(docs[:-1], docs[1:]):
+        if documents_are_consecutive(last_doc, this_doc):
+            current_group.append(this_doc)
+        else:
+            groups.append(current_group)
+            current_group = [this_doc]
 
     # Handle the last group
     if current_group:
-        group_id = create_group_uuid(current_filename, current_group_indices)
-        result.groups[group_id] = current_group
+        groups.append(current_group)
+
+    result.groups = {
+        create_group_uuid_for_group(group): {doc.metadata["uuid"]: doc for doc in group} for group in groups
+    }
 
     return result

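For reference, a minimal, self-contained sketch of the grouping pass that the refactored function now performs. It uses langchain_core's Document directly and plain lists in place of redbox's DocumentState/DocumentGroup; the file name, indices, and UUIDs are made-up illustrative values, and the helper here simply mirrors documents_are_consecutive from the diff above.

from uuid import uuid4

from langchain_core.documents import Document

# Three chunks of the same file: indices 0 and 1 are consecutive, index 3 is not,
# so the grouping pass should produce two groups -> [0, 1] and [3].
docs = [
    Document(page_content=f"chunk {i}", metadata={"uri": "report.pdf", "index": i, "uuid": uuid4()})
    for i in (0, 1, 3)
]

def are_consecutive(first: Document, second: Document) -> bool:
    # Mirrors documents_are_consecutive above: same file, and the second index follows the first.
    if first.metadata["uri"] is None:
        return True
    if first.metadata["uri"] != second.metadata["uri"]:
        return False
    return first.metadata["index"] + 1 == second.metadata["index"]

groups: list[list[Document]] = []
current_group = [docs[0]]
for last_doc, this_doc in zip(docs[:-1], docs[1:]):
    if are_consecutive(last_doc, this_doc):
        current_group.append(this_doc)
    else:
        groups.append(current_group)
        current_group = [this_doc]
groups.append(current_group)

print([[d.metadata["index"] for d in g] for g in groups])  # [[0, 1], [3]]

In the committed code, each such group is then keyed by create_group_uuid_for_group, and its documents are keyed by their "uuid" metadata, giving the DocumentState built in the final dict comprehension of the diff.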

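A note on the group keys: create_group_uuid_for_group only derives the file name and index list from the documents and delegates to the existing create_group_uuid, which returns uuid5(NAMESPACE_DNS, unique_str), so identical groups always map to the same key. A rough sketch of that property, with an assumed unique_str format (the real construction is not shown in this diff):

from uuid import NAMESPACE_DNS, UUID, uuid5

def group_uuid(file_name: str, indices: list[int]) -> UUID:
    # Hypothetical stand-in for create_group_uuid: only the uuid5(NAMESPACE_DNS, ...)
    # call is visible in the diff; this string format is assumed for illustration.
    unique_str = f"{file_name}:{indices}"
    return uuid5(NAMESPACE_DNS, unique_str)

# The same file and the same block of indices always hash to the same group id.
assert group_uuid("report.pdf", [0, 1]) == group_uuid("report.pdf", [0, 1])
assert group_uuid("report.pdf", [0, 1]) != group_uuid("report.pdf", [3])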