Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Add metadata to pdf extraction #2278

Merged
merged 1 commit into from
Aug 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions backend/danswer/connectors/file/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from danswer.file_processing.extract_file_text import get_file_ext
from danswer.file_processing.extract_file_text import is_text_file_extension
from danswer.file_processing.extract_file_text import load_files_from_zip
from danswer.file_processing.extract_file_text import pdf_to_text
from danswer.file_processing.extract_file_text import read_pdf_file
from danswer.file_processing.extract_file_text import read_text_file
from danswer.file_store.file_store import get_default_file_store
from danswer.utils.logger import setup_logger
Expand Down Expand Up @@ -75,7 +75,7 @@ def _process_file(

# Using the PDF reader function directly to pass in password cleanly
elif extension == ".pdf":
file_content_raw = pdf_to_text(file=file, pdf_pass=pdf_pass)
file_content_raw, file_metadata = read_pdf_file(file=file, pdf_pass=pdf_pass)

else:
file_content_raw = extract_file_text(
Expand Down
5 changes: 3 additions & 2 deletions backend/danswer/connectors/google_drive/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.file_processing.extract_file_text import docx_to_text
from danswer.file_processing.extract_file_text import pdf_to_text
from danswer.file_processing.extract_file_text import pptx_to_text
from danswer.file_processing.extract_file_text import read_pdf_file
from danswer.utils.batching import batch_generator
from danswer.utils.logger import setup_logger

Expand Down Expand Up @@ -334,7 +334,8 @@ def extract_text(file: dict[str, str], service: discovery.Resource) -> str:
return docx_to_text(file=io.BytesIO(response))
elif mime_type == GDriveMimeType.PDF.value:
response = service.files().get_media(fileId=file["id"]).execute()
return pdf_to_text(file=io.BytesIO(response))
text, _ = read_pdf_file(file=io.BytesIO(response))
return text
elif mime_type == GDriveMimeType.POWERPOINT.value:
response = service.files().get_media(fileId=file["id"]).execute()
return pptx_to_text(file=io.BytesIO(response))
Expand Down
8 changes: 5 additions & 3 deletions backend/danswer/connectors/web/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.file_processing.extract_file_text import pdf_to_text
from danswer.file_processing.extract_file_text import read_pdf_file
from danswer.file_processing.html_utils import web_html_cleanup
from danswer.utils.logger import setup_logger
from danswer.utils.sitemap import list_pages_for_site
Expand Down Expand Up @@ -284,15 +284,17 @@ def load_from_state(self) -> GenerateDocumentsOutput:
if current_url.split(".")[-1] == "pdf":
# PDF files are not checked for links
response = requests.get(current_url)
page_text = pdf_to_text(file=io.BytesIO(response.content))
page_text, metadata = read_pdf_file(
file=io.BytesIO(response.content)
)

doc_batch.append(
Document(
id=current_url,
sections=[Section(link=current_url, text=page_text)],
source=DocumentSource.WEB,
semantic_identifier=current_url.split("/")[-1],
metadata={},
metadata=metadata,
)
)
continue
Expand Down
22 changes: 17 additions & 5 deletions backend/danswer/file_processing/extract_file_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,11 @@ def read_text_file(
return file_content_raw, metadata


def pdf_to_text(file: IO[Any], pdf_pass: str | None = None) -> str:
def read_pdf_file(
file: IO[Any],
pdf_pass: str | None = None,
) -> str:
metadata = {}
try:
pdf_reader = PdfReader(file)

Expand All @@ -197,8 +201,16 @@ def pdf_to_text(file: IO[Any], pdf_pass: str | None = None) -> str:
# can be discoverable by title.
return ""

return TEXT_SECTION_SEPARATOR.join(
page.extract_text() for page in pdf_reader.pages
# Extract metadata from the PDF, removing leading '/' from keys if present
# This standardizes the metadata keys for consistency
metadata = {
k[1:] if k.startswith("/") else k: v for k, v in pdf_reader.metadata.items()
}
return (
TEXT_SECTION_SEPARATOR.join(
page.extract_text() for page in pdf_reader.pages
),
metadata,
)
except PdfStreamError:
logger.exception("PDF file is not a valid PDF")
Expand All @@ -207,7 +219,7 @@ def pdf_to_text(file: IO[Any], pdf_pass: str | None = None) -> str:

# File is still discoverable by title
# but the contents are not included as they cannot be parsed
return ""
return "", metadata


def docx_to_text(file: IO[Any]) -> str:
Expand Down Expand Up @@ -273,7 +285,7 @@ def extract_file_text(
break_on_unprocessable: bool = True,
) -> str:
extension_to_function: dict[str, Callable[[IO[Any]], str]] = {
".pdf": pdf_to_text,
".pdf": read_pdf_file,
".docx": docx_to_text,
".pptx": pptx_to_text,
".xlsx": xlsx_to_text,
Expand Down
Loading