From e7b87286414f9a04904a1e0e063064fc8fba471d Mon Sep 17 00:00:00 2001 From: Art Matsak <5328078+artmatsak@users.noreply.github.com> Date: Mon, 17 Jun 2024 17:45:44 +0200 Subject: [PATCH 1/3] Implement indexing of simple tables in Word files --- .../file_processing/extract_file_text.py | 38 ++++++++++++++++++- backend/requirements/default.txt | 2 +- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/backend/danswer/file_processing/extract_file_text.py b/backend/danswer/file_processing/extract_file_text.py index 61abddf4d85..a0b36f21744 100644 --- a/backend/danswer/file_processing/extract_file_text.py +++ b/backend/danswer/file_processing/extract_file_text.py @@ -240,9 +240,43 @@ def read_pdf_file( def docx_to_text(file: IO[Any]) -> str: + def is_simple_table(table: docx.table.Table) -> bool: + for row in table.rows: + # No omitted cells + if row.grid_cols_before > 0 or row.grid_cols_after > 0: + return False + + # No nested tables + if any(cell.tables for cell in row.cells): + return False + + return True + + def extract_cell_text(cell: docx.table._Cell) -> str: + cell_paragraphs = [para.text.strip() for para in cell.paragraphs] + return ' '.join(p for p in cell_paragraphs if p) + + paragraphs = [] + doc = docx.Document(file) - full_text = [para.text for para in doc.paragraphs] - return TEXT_SECTION_SEPARATOR.join(full_text) + for item in doc.iter_inner_content(): + if isinstance(item, docx.text.paragraph.Paragraph): + paragraphs.append(item.text) + elif isinstance(item, docx.table.Table): + if not item.rows or not is_simple_table(item): + continue + + # We assume the first row to be the table heading + headings = [extract_cell_text(c).rstrip(":") for c in item.rows[0].cells] + for row in item.rows[1:]: + row_lines = [] + for i, cell in enumerate(row.cells): + # Squash cell paragraphs into a single line of text + cell_text = extract_cell_text(cell) + row_lines.append(f"{headings[i]}: {cell_text}" if headings[i] else cell_text) + paragraphs.append("\n".join(row_lines)) + + return TEXT_SECTION_SEPARATOR.join(paragraphs) def pptx_to_text(file: IO[Any]) -> str: diff --git a/backend/requirements/default.txt b/backend/requirements/default.txt index 09d58d510b8..9a3fa8bbad1 100644 --- a/backend/requirements/default.txt +++ b/backend/requirements/default.txt @@ -50,7 +50,7 @@ python-pptx==0.6.23 pypdf==3.17.0 pytest-mock==3.12.0 pytest-playwright==0.3.2 -python-docx==1.1.0 +python-docx==1.1.2 python-dotenv==1.0.0 python-multipart==0.0.7 pywikibot==9.0.0 From 0c88c0f1c7e61cfa49bbab750dac559f7b5e08fe Mon Sep 17 00:00:00 2001 From: Yuhong Sun Date: Sun, 8 Sep 2024 09:35:59 -0700 Subject: [PATCH 2/3] updated --- .../file_processing/extract_file_text.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/backend/danswer/file_processing/extract_file_text.py b/backend/danswer/file_processing/extract_file_text.py index a0b36f21744..59fda2c8c2f 100644 --- a/backend/danswer/file_processing/extract_file_text.py +++ b/backend/danswer/file_processing/extract_file_text.py @@ -254,7 +254,7 @@ def is_simple_table(table: docx.table.Table) -> bool: def extract_cell_text(cell: docx.table._Cell) -> str: cell_paragraphs = [para.text.strip() for para in cell.paragraphs] - return ' '.join(p for p in cell_paragraphs if p) + return " ".join(p for p in cell_paragraphs if p) or "N/A" paragraphs = [] @@ -266,17 +266,17 @@ def extract_cell_text(cell: docx.table._Cell) -> str: if not item.rows or not is_simple_table(item): continue - # We assume the first row to be the table heading - headings = [extract_cell_text(c).rstrip(":") for c in item.rows[0].cells] - for row in item.rows[1:]: - row_lines = [] - for i, cell in enumerate(row.cells): - # Squash cell paragraphs into a single line of text - cell_text = extract_cell_text(cell) - row_lines.append(f"{headings[i]}: {cell_text}" if headings[i] else cell_text) - paragraphs.append("\n".join(row_lines)) - - return TEXT_SECTION_SEPARATOR.join(paragraphs) + # Every row is a new line, joined with a single newline + table_content = "\n".join( + [ + ",\t".join(extract_cell_text(cell) for cell in row.cells) + for row in item.rows + ] + ) + paragraphs.append(table_content) + + # Docx already has good spacing between paragraphs + return "\n".join(paragraphs) def pptx_to_text(file: IO[Any]) -> str: From b9a48ec3f884432fff2af5c7d589fb147b05c627 Mon Sep 17 00:00:00 2001 From: Yuhong Sun Date: Sun, 8 Sep 2024 09:38:20 -0700 Subject: [PATCH 3/3] k --- backend/danswer/file_processing/extract_file_text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/danswer/file_processing/extract_file_text.py b/backend/danswer/file_processing/extract_file_text.py index 59fda2c8c2f..36df08ac465 100644 --- a/backend/danswer/file_processing/extract_file_text.py +++ b/backend/danswer/file_processing/extract_file_text.py @@ -257,11 +257,11 @@ def extract_cell_text(cell: docx.table._Cell) -> str: return " ".join(p for p in cell_paragraphs if p) or "N/A" paragraphs = [] - doc = docx.Document(file) for item in doc.iter_inner_content(): if isinstance(item, docx.text.paragraph.Paragraph): paragraphs.append(item.text) + elif isinstance(item, docx.table.Table): if not item.rows or not is_simple_table(item): continue