From 84133868fee9f314a0dd6fba7e188f9d3cf51506 Mon Sep 17 00:00:00 2001 From: yihong0618 Date: Fri, 29 Nov 2024 18:18:50 +0800 Subject: [PATCH 1/2] fix: excel in node only read one sheet, close #9661 Signed-off-by: yihong0618 --- .../workflow/nodes/document_extractor/node.py | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/api/core/workflow/nodes/document_extractor/node.py b/api/core/workflow/nodes/document_extractor/node.py index d963241f07bc43..bb8d7f56db59ee 100644 --- a/api/core/workflow/nodes/document_extractor/node.py +++ b/api/core/workflow/nodes/document_extractor/node.py @@ -4,8 +4,8 @@ import docx import pandas as pd -import pypdfium2 -import yaml +import pypdfium2 # type: ignore +import yaml # type: ignore from unstructured.partition.api import partition_via_api from unstructured.partition.email import partition_email from unstructured.partition.epub import partition_epub @@ -236,16 +236,17 @@ def _extract_text_from_csv(file_content: bytes) -> str: def _extract_text_from_excel(file_content: bytes) -> str: - """Extract text from an Excel file using pandas.""" - try: - df = pd.read_excel(io.BytesIO(file_content)) - - # Drop rows where all elements are NaN - df.dropna(how="all", inplace=True) - - # Convert DataFrame to Markdown table - markdown_table = df.to_markdown(index=False) + excel_file = pd.ExcelFile(io.BytesIO(file_content)) + markdown_table = "" + for sheet_name in excel_file.sheet_names: + try: + df = excel_file.parse(sheet_name=sheet_name) + df.dropna(how="all", inplace=True) + # Append a blank line after each sheet's table so consecutive tables stay separated + markdown_table += df.to_markdown(index=False) + "\n\n" + except Exception as e: + continue return markdown_table except Exception as e: raise TextExtractionError(f"Failed to extract text from Excel file: {str(e)}") from e From 8be7e63886c9c5c85322226d729a42cee2a6121b Mon Sep 17 00:00:00 2001 From: yihong0618 Date: Fri, 29 Nov 2024 18:24:27 +0800 Subject: [PATCH 2/2] 
fix: bring back comments Signed-off-by: yihong0618 --- api/core/workflow/nodes/document_extractor/node.py | 1 + 1 file changed, 1 insertion(+) diff --git a/api/core/workflow/nodes/document_extractor/node.py b/api/core/workflow/nodes/document_extractor/node.py index bb8d7f56db59ee..d490a2eb03aff9 100644 --- a/api/core/workflow/nodes/document_extractor/node.py +++ b/api/core/workflow/nodes/document_extractor/node.py @@ -236,6 +236,7 @@ def _extract_text_from_csv(file_content: bytes) -> str: def _extract_text_from_excel(file_content: bytes) -> str: + """Extract text from an Excel file using pandas.""" try: excel_file = pd.ExcelFile(io.BytesIO(file_content)) markdown_table = ""