Skip to content

Commit

Permalink
updated
Browse files (browse the repository at this point in the history)
  • Loading branch information
yuhongsun96 committed Sep 8, 2024
1 parent e7b8728 commit 0c88c0f
Showing 1 changed file with 12 additions and 12 deletions.
24 changes: 12 additions & 12 deletions backend/danswer/file_processing/extract_file_text.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -254,7 +254,7 @@ def is_simple_table(table: docx.table.Table) -> bool:

def extract_cell_text(cell: docx.table._Cell) -> str:
    """Flatten a table cell into a single line of text.

    Each paragraph of the cell is stripped of surrounding whitespace, blank
    paragraphs are dropped, and the remaining ones are joined with a single
    space.

    Args:
        cell: The table cell whose ``paragraphs`` are read.

    Returns:
        The flattened cell text, or ``"N/A"`` when the cell contains no
        non-blank text.
    """
    cell_paragraphs = [para.text.strip() for para in cell.paragraphs]
    # Fall back to a placeholder for empty cells: rows are emitted as
    # delimiter-separated fields, so a blank cell must still occupy its slot.
    return " ".join(p for p in cell_paragraphs if p) or "N/A"

paragraphs = []

Expand All @@ -266,17 +266,17 @@ def extract_cell_text(cell: docx.table._Cell) -> str:
if not item.rows or not is_simple_table(item):
continue

# We assume the first row to be the table heading
headings = [extract_cell_text(c).rstrip(":") for c in item.rows[0].cells]
for row in item.rows[1:]:
row_lines = []
for i, cell in enumerate(row.cells):
# Squash cell paragraphs into a single line of text
cell_text = extract_cell_text(cell)
row_lines.append(f"{headings[i]}: {cell_text}" if headings[i] else cell_text)
paragraphs.append("\n".join(row_lines))

return TEXT_SECTION_SEPARATOR.join(paragraphs)
# Every row is a new line, joined with a single newline
table_content = "\n".join(
[
",\t".join(extract_cell_text(cell) for cell in row.cells)
for row in item.rows
]
)
paragraphs.append(table_content)

# Docx already has good spacing between paragraphs
return "\n".join(paragraphs)


def pptx_to_text(file: IO[Any]) -> str:
Expand Down

0 comments on commit 0c88c0f

Please sign in to comment.