From 84133868fee9f314a0dd6fba7e188f9d3cf51506 Mon Sep 17 00:00:00 2001
From: yihong0618 <zouzou0208@gmail.com>
Date: Fri, 29 Nov 2024 18:18:50 +0800
Subject: [PATCH 1/2] fix: Excel extraction in document extractor node only reads one sheet, close #9661

Signed-off-by: yihong0618 <zouzou0208@gmail.com>
---
 .../workflow/nodes/document_extractor/node.py | 23 ++++++++++---------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/api/core/workflow/nodes/document_extractor/node.py b/api/core/workflow/nodes/document_extractor/node.py
index d963241f07bc43..bb8d7f56db59ee 100644
--- a/api/core/workflow/nodes/document_extractor/node.py
+++ b/api/core/workflow/nodes/document_extractor/node.py
@@ -4,8 +4,8 @@
 
 import docx
 import pandas as pd
-import pypdfium2
-import yaml
+import pypdfium2  # type: ignore
+import yaml  # type: ignore
 from unstructured.partition.api import partition_via_api
 from unstructured.partition.email import partition_email
 from unstructured.partition.epub import partition_epub
@@ -236,16 +236,17 @@ def _extract_text_from_csv(file_content: bytes) -> str:
 
 
 def _extract_text_from_excel(file_content: bytes) -> str:
-    """Extract text from an Excel file using pandas."""
-
     try:
-        df = pd.read_excel(io.BytesIO(file_content))
-
-        # Drop rows where all elements are NaN
-        df.dropna(how="all", inplace=True)
-
-        # Convert DataFrame to Markdown table
-        markdown_table = df.to_markdown(index=False)
+        excel_file = pd.ExcelFile(io.BytesIO(file_content))
+        markdown_table = ""
+        for sheet_name in excel_file.sheet_names:
+            try:
+                df = excel_file.parse(sheet_name=sheet_name)
+                df.dropna(how="all", inplace=True)
+                # Append two newlines so consecutive sheet tables are separated by a blank line
+                markdown_table += df.to_markdown(index=False) + "\n\n"
+            except Exception as e:
+                continue
         return markdown_table
     except Exception as e:
         raise TextExtractionError(f"Failed to extract text from Excel file: {str(e)}") from e

From 8be7e63886c9c5c85322226d729a42cee2a6121b Mon Sep 17 00:00:00 2001
From: yihong0618 <zouzou0208@gmail.com>
Date: Fri, 29 Nov 2024 18:24:27 +0800
Subject: [PATCH 2/2] fix: bring back comments

Signed-off-by: yihong0618 <zouzou0208@gmail.com>
---
 api/core/workflow/nodes/document_extractor/node.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/api/core/workflow/nodes/document_extractor/node.py b/api/core/workflow/nodes/document_extractor/node.py
index bb8d7f56db59ee..d490a2eb03aff9 100644
--- a/api/core/workflow/nodes/document_extractor/node.py
+++ b/api/core/workflow/nodes/document_extractor/node.py
@@ -236,6 +236,7 @@ def _extract_text_from_csv(file_content: bytes) -> str:
 
 
 def _extract_text_from_excel(file_content: bytes) -> str:
+    """Extract text from an Excel file using pandas."""
     try:
         excel_file = pd.ExcelFile(io.BytesIO(file_content))
         markdown_table = ""