From 671357a14c755b1be66cf7e63ea32e8c994ff94d Mon Sep 17 00:00:00 2001 From: Zixuan Cheng <61724187+Theysua@users.noreply.github.com> Date: Sat, 26 Oct 2024 04:49:17 -0700 Subject: [PATCH 1/4] Provide .json support for file extractor. --- api/core/workflow/nodes/document_extractor/node.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/api/core/workflow/nodes/document_extractor/node.py b/api/core/workflow/nodes/document_extractor/node.py index b4ffee1f13c724..f5838efee5edd3 100644 --- a/api/core/workflow/nodes/document_extractor/node.py +++ b/api/core/workflow/nodes/document_extractor/node.py @@ -103,6 +103,8 @@ def _extract_text_by_mime_type(*, file_content: bytes, mime_type: str) -> str: return _extract_text_from_eml(file_content) elif mime_type == "application/vnd.ms-outlook": return _extract_text_from_msg(file_content) + elif mime_type == "application/json": + return _extract_text_from_json(file_content) else: raise UnsupportedFileTypeError(f"Unsupported MIME type: {mime_type}") @@ -112,6 +114,8 @@ def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str) match file_extension: case ".txt" | ".markdown" | ".md" | ".html" | ".htm" | ".xml": return _extract_text_from_plain_text(file_content) + case ".json": + return _extract_text_from_json(file_content) case ".pdf": return _extract_text_from_pdf(file_content) case ".doc" | ".docx": @@ -140,6 +144,12 @@ def _extract_text_from_plain_text(file_content: bytes) -> str: except UnicodeDecodeError as e: raise TextExtractionError("Failed to decode plain text file") from e +def _extract_text_from_json(file_content: bytes) -> str: + try: + json_data = json.loads(file_content.decode("utf-8")) + return json.dumps(json_data, indent=2, ensure_ascii=False) + except (UnicodeDecodeError, json.JSONDecodeError) as e: + raise TextExtractionError(f"Failed to decode or parse JSON file: {e}") from e def _extract_text_from_pdf(file_content: bytes) -> str: try: From 0014db749d9851ce2cfaba4828ddd62a5c5ba4ed Mon Sep 17 00:00:00 2001 From: Zixuan Cheng <61724187+Theysua@users.noreply.github.com> Date: Sat, 26 Oct 2024 04:56:18 -0700 Subject: [PATCH 2/4] support .json in document_extractor --- api/core/workflow/nodes/document_extractor/node.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/api/core/workflow/nodes/document_extractor/node.py b/api/core/workflow/nodes/document_extractor/node.py index f5838efee5edd3..78808a8d3f94f2 100644 --- a/api/core/workflow/nodes/document_extractor/node.py +++ b/api/core/workflow/nodes/document_extractor/node.py @@ -144,6 +144,7 @@ def _extract_text_from_plain_text(file_content: bytes) -> str: except UnicodeDecodeError as e: raise TextExtractionError("Failed to decode plain text file") from e + def _extract_text_from_json(file_content: bytes) -> str: try: json_data = json.loads(file_content.decode("utf-8")) @@ -151,6 +152,7 @@ def _extract_text_from_json(file_content: bytes) -> str: except (UnicodeDecodeError, json.JSONDecodeError) as e: raise TextExtractionError(f"Failed to decode or parse JSON file: {e}") from e + def _extract_text_from_pdf(file_content: bytes) -> str: try: pdf_file = io.BytesIO(file_content) From 94673cb16adae9161bc56beaa0540f0cbcd4762e Mon Sep 17 00:00:00 2001 From: -LAN- Date: Sat, 26 Oct 2024 20:17:16 +0800 Subject: [PATCH 3/4] refactor(api): streamline MIME type handling in document extractor --- .../workflow/nodes/document_extractor/node.py | 64 +++++++++---------- 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/api/core/workflow/nodes/document_extractor/node.py b/api/core/workflow/nodes/document_extractor/node.py index 78808a8d3f94f2..cf30ea75ab5e8d 100644 --- a/api/core/workflow/nodes/document_extractor/node.py +++ b/api/core/workflow/nodes/document_extractor/node.py @@ -1,5 +1,6 @@ import csv import io +import json import docx import pandas as pd @@ -77,36 +78,31 @@ def _run(self): def _extract_text_by_mime_type(*, file_content: bytes, mime_type: str) -> str: """Extract text from a file based on its MIME type.""" - if mime_type.startswith("text/plain") or mime_type in {"text/html", "text/htm", "text/markdown", "text/xml"}: - return _extract_text_from_plain_text(file_content) - elif mime_type == "application/pdf": - return _extract_text_from_pdf(file_content) - elif mime_type in { - "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - "application/msword", - }: - return _extract_text_from_doc(file_content) - elif mime_type == "text/csv": - return _extract_text_from_csv(file_content) - elif mime_type in { - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - "application/vnd.ms-excel", - }: - return _extract_text_from_excel(file_content) - elif mime_type == "application/vnd.ms-powerpoint": - return _extract_text_from_ppt(file_content) - elif mime_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation": - return _extract_text_from_pptx(file_content) - elif mime_type == "application/epub+zip": - return _extract_text_from_epub(file_content) - elif mime_type == "message/rfc822": - return _extract_text_from_eml(file_content) - elif mime_type == "application/vnd.ms-outlook": - return _extract_text_from_msg(file_content) - elif mime_type == "application/json": - return _extract_text_from_json(file_content) - else: - raise UnsupportedFileTypeError(f"Unsupported MIME type: {mime_type}") + match mime_type: + case "text/plain" | "text/html" | "text/htm" | "text/markdown" | "text/xml": + return _extract_text_from_plain_text(file_content) + case "application/pdf": + return _extract_text_from_pdf(file_content) + case "application/vnd.openxmlformats-officedocument.wordprocessingml.document" | "application/msword": + return _extract_text_from_doc(file_content) + case "text/csv": + return _extract_text_from_csv(file_content) + case "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" | "application/vnd.ms-excel": + return _extract_text_from_excel(file_content) + case "application/vnd.ms-powerpoint": + return _extract_text_from_ppt(file_content) + case "application/vnd.openxmlformats-officedocument.presentationml.presentation": + return _extract_text_from_pptx(file_content) + case "application/epub+zip": + return _extract_text_from_epub(file_content) + case "message/rfc822": + return _extract_text_from_eml(file_content) + case "application/vnd.ms-outlook": + return _extract_text_from_msg(file_content) + case "application/json": + return _extract_text_from_json(file_content) + case _: + raise UnsupportedFileTypeError(f"Unsupported MIME type: {mime_type}") def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str) -> str: @@ -195,13 +191,13 @@ def _download_file_content(file: File) -> bytes: def _extract_text_from_file(file: File): - if file.mime_type is None: - raise UnsupportedFileTypeError("Unable to determine file type: MIME type is missing") file_content = _download_file_content(file) - if file.transfer_method == FileTransferMethod.REMOTE_URL: + if file.extension is not None: + extracted_text = _extract_text_by_file_extension(file_content=file_content, file_extension=file.extension) + elif file.mime_type is not None: extracted_text = _extract_text_by_mime_type(file_content=file_content, mime_type=file.mime_type) else: - extracted_text = _extract_text_by_file_extension(file_content=file_content, file_extension=file.extension) + raise UnsupportedFileTypeError("Unable to determine file type: MIME type or file extension is missing") return extracted_text From f8f4796c053c0f9e2f47a456e907af146e3b6706 Mon Sep 17 00:00:00 2001 From: -LAN- Date: Sat, 26 Oct 2024 20:23:52 +0800 Subject: [PATCH 4/4] fix(api): improve file type checks in document extractor --- api/core/workflow/nodes/document_extractor/node.py | 4 ++-- .../core/workflow/nodes/test_document_extractor_node.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/api/core/workflow/nodes/document_extractor/node.py b/api/core/workflow/nodes/document_extractor/node.py index cf30ea75ab5e8d..9e09b6d29aeb7c 100644 --- a/api/core/workflow/nodes/document_extractor/node.py +++ b/api/core/workflow/nodes/document_extractor/node.py @@ -192,9 +192,9 @@ def _download_file_content(file: File) -> bytes: def _extract_text_from_file(file: File): file_content = _download_file_content(file) - if file.extension is not None: + if file.extension: extracted_text = _extract_text_by_file_extension(file_content=file_content, file_extension=file.extension) - elif file.mime_type is not None: + elif file.mime_type: extracted_text = _extract_text_by_mime_type(file_content=file_content, mime_type=file.mime_type) else: raise UnsupportedFileTypeError("Unable to determine file type: MIME type or file extension is missing") diff --git a/api/tests/unit_tests/core/workflow/nodes/test_document_extractor_node.py b/api/tests/unit_tests/core/workflow/nodes/test_document_extractor_node.py index a141fa9a13d688..4f1f8f05c8ea92 100644 --- a/api/tests/unit_tests/core/workflow/nodes/test_document_extractor_node.py +++ b/api/tests/unit_tests/core/workflow/nodes/test_document_extractor_node.py @@ -125,7 +125,7 @@ def test_run_extract_text( result = document_extractor_node._run() assert isinstance(result, NodeRunResult) - assert result.status == WorkflowNodeExecutionStatus.SUCCEEDED + assert result.status == WorkflowNodeExecutionStatus.SUCCEEDED, result.error assert result.outputs is not None assert result.outputs["text"] == expected_text