From 2641e6e4f3cac09a615c2e269d9be633ac6dfbb1 Mon Sep 17 00:00:00 2001 From: "Aivin V. Solatorio" Date: Fri, 16 Jun 2023 00:23:34 -0400 Subject: [PATCH] Fix schema access to title (#14) Signed-off-by: Aivin V. Solatorio --- llm4data/schema/schema2info.py | 2 +- llm4data/scripts/indexing/docs/docs.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/llm4data/schema/schema2info.py b/llm4data/schema/schema2info.py index f402385..7c68786 100644 --- a/llm4data/schema/schema2info.py +++ b/llm4data/schema/schema2info.py @@ -3,7 +3,7 @@ def get_doc_title(metadata: dict) -> str: """Get the title of the document from the metadata document_description.""" - return metadata["title_statement"]["title"] + return metadata["document_description"]["title_statement"]["title"] def get_doc_authors(metadata: dict) -> list: diff --git a/llm4data/scripts/indexing/docs/docs.py b/llm4data/scripts/indexing/docs/docs.py index 1b70b83..c9b8c17 100644 --- a/llm4data/scripts/indexing/docs/docs.py +++ b/llm4data/scripts/indexing/docs/docs.py @@ -11,6 +11,7 @@ from llm4data.embeddings.docs import get_docs_embeddings from llm4data import index from llm4data import configs +from llm4data.schema.schema2info import get_doc_title # Get the docs embeddings docs_embeddings = get_docs_embeddings() @@ -39,7 +40,7 @@ def add_pdf_document(path: Union[str, Path], metadata: Optional[dict] = None): if len(documents): # Index the title of the document documents.append( - Document(page_content=metadata["title"], metadata=documents[0].metadata) + Document(page_content=get_doc_title(metadata), metadata=documents[0].metadata) ) for doc in documents: