From e7b87286414f9a04904a1e0e063064fc8fba471d Mon Sep 17 00:00:00 2001
From: Art Matsak <5328078+artmatsak@users.noreply.github.com>
Date: Mon, 17 Jun 2024 17:45:44 +0200
Subject: [PATCH 1/3] Implement indexing of simple tables in Word files

---
 .../file_processing/extract_file_text.py      | 38 ++++++++++++++++++-
 backend/requirements/default.txt              |  2 +-
 2 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/backend/danswer/file_processing/extract_file_text.py b/backend/danswer/file_processing/extract_file_text.py
index 61abddf4d85..a0b36f21744 100644
--- a/backend/danswer/file_processing/extract_file_text.py
+++ b/backend/danswer/file_processing/extract_file_text.py
@@ -240,9 +240,43 @@ def read_pdf_file(
 
 
 def docx_to_text(file: IO[Any]) -> str:
+    def is_simple_table(table: docx.table.Table) -> bool:
+        for row in table.rows:
+            # No omitted cells
+            if row.grid_cols_before > 0 or row.grid_cols_after > 0:
+                return False
+
+            # No nested tables
+            if any(cell.tables for cell in row.cells):
+                return False
+
+        return True
+
+    def extract_cell_text(cell: docx.table._Cell) -> str:
+        cell_paragraphs = [para.text.strip() for para in cell.paragraphs]
+        return ' '.join(p for p in cell_paragraphs if p)
+
+    paragraphs = []
+
     doc = docx.Document(file)
-    full_text = [para.text for para in doc.paragraphs]
-    return TEXT_SECTION_SEPARATOR.join(full_text)
+    for item in doc.iter_inner_content():
+        if isinstance(item, docx.text.paragraph.Paragraph):
+            paragraphs.append(item.text)
+        elif isinstance(item, docx.table.Table):
+            if not item.rows or not is_simple_table(item):
+                continue
+
+            # We assume the first row to be the table heading
+            headings = [extract_cell_text(c).rstrip(":") for c in item.rows[0].cells]
+            for row in item.rows[1:]:
+                row_lines = []
+                for i, cell in enumerate(row.cells):
+                    # Squash cell paragraphs into a single line of text
+                    cell_text = extract_cell_text(cell)
+                    row_lines.append(f"{headings[i]}: {cell_text}" if headings[i] else cell_text)
+                paragraphs.append("\n".join(row_lines))
+
+    return TEXT_SECTION_SEPARATOR.join(paragraphs)
 
 
 def pptx_to_text(file: IO[Any]) -> str:
diff --git a/backend/requirements/default.txt b/backend/requirements/default.txt
index 09d58d510b8..9a3fa8bbad1 100644
--- a/backend/requirements/default.txt
+++ b/backend/requirements/default.txt
@@ -50,7 +50,7 @@ python-pptx==0.6.23
 pypdf==3.17.0
 pytest-mock==3.12.0
 pytest-playwright==0.3.2
-python-docx==1.1.0
+python-docx==1.1.2
 python-dotenv==1.0.0
 python-multipart==0.0.7
 pywikibot==9.0.0

From 0c88c0f1c7e61cfa49bbab750dac559f7b5e08fe Mon Sep 17 00:00:00 2001
From: Yuhong Sun <yuhongsun96@gmail.com>
Date: Sun, 8 Sep 2024 09:35:59 -0700
Subject: [PATCH 2/3] Render Word tables as delimited rows and join docx text with newlines

---
 .../file_processing/extract_file_text.py      | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/backend/danswer/file_processing/extract_file_text.py b/backend/danswer/file_processing/extract_file_text.py
index a0b36f21744..59fda2c8c2f 100644
--- a/backend/danswer/file_processing/extract_file_text.py
+++ b/backend/danswer/file_processing/extract_file_text.py
@@ -254,7 +254,7 @@ def is_simple_table(table: docx.table.Table) -> bool:
 
     def extract_cell_text(cell: docx.table._Cell) -> str:
         cell_paragraphs = [para.text.strip() for para in cell.paragraphs]
-        return ' '.join(p for p in cell_paragraphs if p)
+        return " ".join(p for p in cell_paragraphs if p) or "N/A"
 
     paragraphs = []
 
@@ -266,17 +266,17 @@ def extract_cell_text(cell: docx.table._Cell) -> str:
             if not item.rows or not is_simple_table(item):
                 continue
 
-            # We assume the first row to be the table heading
-            headings = [extract_cell_text(c).rstrip(":") for c in item.rows[0].cells]
-            for row in item.rows[1:]:
-                row_lines = []
-                for i, cell in enumerate(row.cells):
-                    # Squash cell paragraphs into a single line of text
-                    cell_text = extract_cell_text(cell)
-                    row_lines.append(f"{headings[i]}: {cell_text}" if headings[i] else cell_text)
-                paragraphs.append("\n".join(row_lines))
-
-    return TEXT_SECTION_SEPARATOR.join(paragraphs)
+            # Every row is a new line, joined with a single newline
+            table_content = "\n".join(
+                [
+                    ",\t".join(extract_cell_text(cell) for cell in row.cells)
+                    for row in item.rows
+                ]
+            )
+            paragraphs.append(table_content)
+
+    # Docx already has good spacing between paragraphs
+    return "\n".join(paragraphs)
 
 
 def pptx_to_text(file: IO[Any]) -> str:

From b9a48ec3f884432fff2af5c7d589fb147b05c627 Mon Sep 17 00:00:00 2001
From: Yuhong Sun <yuhongsun96@gmail.com>
Date: Sun, 8 Sep 2024 09:38:20 -0700
Subject: [PATCH 3/3] Adjust blank-line spacing in docx_to_text

---
 backend/danswer/file_processing/extract_file_text.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/danswer/file_processing/extract_file_text.py b/backend/danswer/file_processing/extract_file_text.py
index 59fda2c8c2f..36df08ac465 100644
--- a/backend/danswer/file_processing/extract_file_text.py
+++ b/backend/danswer/file_processing/extract_file_text.py
@@ -257,11 +257,11 @@ def extract_cell_text(cell: docx.table._Cell) -> str:
         return " ".join(p for p in cell_paragraphs if p) or "N/A"
 
     paragraphs = []
-
     doc = docx.Document(file)
     for item in doc.iter_inner_content():
         if isinstance(item, docx.text.paragraph.Paragraph):
             paragraphs.append(item.text)
+
         elif isinstance(item, docx.table.Table):
             if not item.rows or not is_simple_table(item):
                 continue