From 4181124e7afeeddad281024f3dcabf5d18b83e17 Mon Sep 17 00:00:00 2001
From: pablodanswer
Date: Fri, 30 Aug 2024 15:14:02 -0700
Subject: [PATCH] add metadata to pdf extraction (#2278)

---
 backend/danswer/connectors/file/connector.py  |  4 ++--
 .../connectors/google_drive/connector.py      |  5 +++--
 backend/danswer/connectors/web/connector.py   |  8 ++++---
 .../file_processing/extract_file_text.py      | 24 ++++++++++++++++++------
 4 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/backend/danswer/connectors/file/connector.py b/backend/danswer/connectors/file/connector.py
index 6c5501734b0..83d0af2c12e 100644
--- a/backend/danswer/connectors/file/connector.py
+++ b/backend/danswer/connectors/file/connector.py
@@ -23,7 +23,7 @@
 from danswer.file_processing.extract_file_text import get_file_ext
 from danswer.file_processing.extract_file_text import is_text_file_extension
 from danswer.file_processing.extract_file_text import load_files_from_zip
-from danswer.file_processing.extract_file_text import pdf_to_text
+from danswer.file_processing.extract_file_text import read_pdf_file
 from danswer.file_processing.extract_file_text import read_text_file
 from danswer.file_store.file_store import get_default_file_store
 from danswer.utils.logger import setup_logger
@@ -75,7 +75,7 @@ def _process_file(
 
     # Using the PDF reader function directly to pass in password cleanly
     elif extension == ".pdf":
-        file_content_raw = pdf_to_text(file=file, pdf_pass=pdf_pass)
+        file_content_raw, file_metadata = read_pdf_file(file=file, pdf_pass=pdf_pass)
 
     else:
         file_content_raw = extract_file_text(
diff --git a/backend/danswer/connectors/google_drive/connector.py b/backend/danswer/connectors/google_drive/connector.py
index 40a9b73432f..90e5b0ed5db 100644
--- a/backend/danswer/connectors/google_drive/connector.py
+++ b/backend/danswer/connectors/google_drive/connector.py
@@ -41,8 +41,8 @@
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
 from danswer.file_processing.extract_file_text import docx_to_text
-from danswer.file_processing.extract_file_text import pdf_to_text
 from danswer.file_processing.extract_file_text import pptx_to_text
+from danswer.file_processing.extract_file_text import read_pdf_file
 from danswer.utils.batching import batch_generator
 from danswer.utils.logger import setup_logger
 
@@ -334,7 +334,8 @@ def extract_text(file: dict[str, str], service: discovery.Resource) -> str:
         return docx_to_text(file=io.BytesIO(response))
     elif mime_type == GDriveMimeType.PDF.value:
         response = service.files().get_media(fileId=file["id"]).execute()
-        return pdf_to_text(file=io.BytesIO(response))
+        text, _ = read_pdf_file(file=io.BytesIO(response))
+        return text
     elif mime_type == GDriveMimeType.POWERPOINT.value:
         response = service.files().get_media(fileId=file["id"]).execute()
         return pptx_to_text(file=io.BytesIO(response))
diff --git a/backend/danswer/connectors/web/connector.py b/backend/danswer/connectors/web/connector.py
index 6e76e404acd..a98d4dcf84a 100644
--- a/backend/danswer/connectors/web/connector.py
+++ b/backend/danswer/connectors/web/connector.py
@@ -27,7 +27,7 @@
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
-from danswer.file_processing.extract_file_text import pdf_to_text
+from danswer.file_processing.extract_file_text import read_pdf_file
 from danswer.file_processing.html_utils import web_html_cleanup
 from danswer.utils.logger import setup_logger
 from danswer.utils.sitemap import list_pages_for_site
@@ -284,7 +284,9 @@ def load_from_state(self) -> GenerateDocumentsOutput:
                 if current_url.split(".")[-1] == "pdf":
                     # PDF files are not checked for links
                     response = requests.get(current_url)
-                    page_text = pdf_to_text(file=io.BytesIO(response.content))
+                    page_text, metadata = read_pdf_file(
+                        file=io.BytesIO(response.content)
+                    )
 
                     doc_batch.append(
                         Document(
@@ -292,7 +294,7 @@ def load_from_state(self) -> GenerateDocumentsOutput:
                             sections=[Section(link=current_url, text=page_text)],
                             source=DocumentSource.WEB,
                             semantic_identifier=current_url.split("/")[-1],
-                            metadata={},
+                            metadata=metadata,
                         )
                     )
                     continue
diff --git a/backend/danswer/file_processing/extract_file_text.py b/backend/danswer/file_processing/extract_file_text.py
index 7143b428714..eeb7b5ae8ba 100644
--- a/backend/danswer/file_processing/extract_file_text.py
+++ b/backend/danswer/file_processing/extract_file_text.py
@@ -177,7 +177,11 @@ def read_text_file(
     return file_content_raw, metadata
 
 
-def pdf_to_text(file: IO[Any], pdf_pass: str | None = None) -> str:
+def read_pdf_file(
+    file: IO[Any],
+    pdf_pass: str | None = None,
+) -> tuple[str, dict]:
+    metadata = {}
     try:
         pdf_reader = PdfReader(file)
 
@@ -197,8 +201,16 @@ def pdf_to_text(file: IO[Any], pdf_pass: str | None = None) -> str:
             # can be discoverable by title.
-            return ""
+            return "", metadata
 
-        return TEXT_SECTION_SEPARATOR.join(
-            page.extract_text() for page in pdf_reader.pages
+        # Extract metadata from the PDF, removing leading '/' from keys if present;
+        # this standardizes the keys and tolerates a missing Info dictionary (None)
+        metadata = {
+            k[1:] if k.startswith("/") else k: v for k, v in (pdf_reader.metadata or {}).items()
+        }
+        return (
+            TEXT_SECTION_SEPARATOR.join(
+                page.extract_text() for page in pdf_reader.pages
+            ),
+            metadata,
         )
     except PdfStreamError:
         logger.exception("PDF file is not a valid PDF")
@@ -207,7 +219,7 @@ def pdf_to_text(file: IO[Any], pdf_pass: str | None = None) -> str:
 
     # File is still discoverable by title
     # but the contents are not included as they cannot be parsed
-    return ""
+    return "", metadata
 
 
 def docx_to_text(file: IO[Any]) -> str:
@@ -273,7 +285,7 @@ def extract_file_text(
     break_on_unprocessable: bool = True,
 ) -> str:
     extension_to_function: dict[str, Callable[[IO[Any]], str]] = {
-        ".pdf": pdf_to_text,
+        ".pdf": lambda file: read_pdf_file(file)[0],  # keep the str contract; metadata unused here
         ".docx": docx_to_text,
         ".pptx": pptx_to_text,
        ".xlsx": xlsx_to_text,
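
Reviewer note: a minimal sketch (not part of the patch) of how a caller consumes
the new tuple-returning API, mirroring the connector changes above. The
"sample.pdf" path and the surrounding script are hypothetical; read_pdf_file is
the function introduced in extract_file_text.py:

    import io

    from danswer.file_processing.extract_file_text import read_pdf_file

    # Read a local PDF and unpack both the page text and the normalized
    # Info-dictionary metadata that the patched function now returns.
    with open("sample.pdf", "rb") as f:  # placeholder path
        page_text, metadata = read_pdf_file(file=io.BytesIO(f.read()))

    # Keys arrive without the leading '/', e.g. "Title" rather than "/Title".
    print(metadata.get("Title", "<untitled>"))
    print(page_text[:200])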
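Reviewer note: the key normalization inside read_pdf_file can also be exercised
in isolation with pypdf, the library the patch already relies on for PdfReader.
This standalone sketch is illustrative only; normalized_pdf_metadata is a
hypothetical helper, not part of the patch:

    from typing import IO, Any

    from pypdf import PdfReader

    def normalized_pdf_metadata(file: IO[Any]) -> dict:
        # pypdf exposes PDF Info-dictionary keys with a leading slash
        # ("/Title", "/Author", ...); strip it so downstream consumers see
        # plain "Title"/"Author" keys, exactly as the patch does. The
        # metadata property may be None for PDFs without an Info dictionary,
        # hence the empty-dict fallback.
        reader = PdfReader(file)
        return {
            k[1:] if k.startswith("/") else k: v
            for k, v in (reader.metadata or {}).items()
        }

Called with any binary file handle (e.g. normalized_pdf_metadata(open("doc.pdf", "rb"))),
this returns something like {"Title": ..., "Author": ...} for a typical PDF.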