Updated docx table text extraction

onyx-dot-app · Sep 8, 2024 · 0c88c0f · 0c88c0f
1 parent e7b8728
commit 0c88c0f
Showing 1 changed file with 12 additions and 12 deletions.
diff --git a/backend/danswer/file_processing/extract_file_text.py b/backend/danswer/file_processing/extract_file_text.py
@@ -254,7 +254,7 @@ def is_simple_table(table: docx.table.Table) -> bool:
 
     def extract_cell_text(cell: docx.table._Cell) -> str:
         cell_paragraphs = [para.text.strip() for para in cell.paragraphs]
-        return ' '.join(p for p in cell_paragraphs if p)
+        return " ".join(p for p in cell_paragraphs if p) or "N/A"
 
     paragraphs = []
 
@@ -266,17 +266,17 @@ def extract_cell_text(cell: docx.table._Cell) -> str:
             if not item.rows or not is_simple_table(item):
                 continue
 
-            # We assume the first row to be the table heading
-            headings = [extract_cell_text(c).rstrip(":") for c in item.rows[0].cells]
-            for row in item.rows[1:]:
-                row_lines = []
-                for i, cell in enumerate(row.cells):
-                    # Squash cell paragraphs into a single line of text
-                    cell_text = extract_cell_text(cell)
-                    row_lines.append(f"{headings[i]}: {cell_text}" if headings[i] else cell_text)
-                paragraphs.append("\n".join(row_lines))
-
-    return TEXT_SECTION_SEPARATOR.join(paragraphs)
+            # Every row is a new line, joined with a single newline
+            table_content = "\n".join(
+                [
+                    ",\t".join(extract_cell_text(cell) for cell in row.cells)
+                    for row in item.rows
+                ]
+            )
+            paragraphs.append(table_content)
+
+    # Docx already has good spacing between paragraphs
+    return "\n".join(paragraphs)
 
 
 def pptx_to_text(file: IO[Any]) -> str: