From bfde8574204d1111309fbbff0b4347a3bf518676 Mon Sep 17 00:00:00 2001 From: KennyZhang1 <90438893+KennyZhang1@users.noreply.github.com> Date: Fri, 24 Jan 2025 17:09:32 -0500 Subject: [PATCH 1/5] Add support for conversion via Document Intelligence (#303) * added cli params for doc intel * added DocumentIntelligenceConverter class implementation * initialized doc intel client instance field * added isolated doc_intel main conversion function * temp fix for ContentFormat import bug * ran tests for docintel and offline for many filetypes * push doc intel converter to the top of the stack * formatting changes * modified project toml file --- pyproject.toml | 2 + src/markitdown/__main__.py | 33 ++++++++++--- src/markitdown/_markitdown.py | 88 +++++++++++++++++++++++++++++++++++ 3 files changed, 117 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9c113ade..2a4e203e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,6 +42,8 @@ dependencies = [ "pathvalidate", "charset-normalizer", "openai", + "azure-ai-documentintelligence", + "azure-identity" ] [project.urls] diff --git a/src/markitdown/__main__.py b/src/markitdown/__main__.py index b6cf963b..69e8f0ea 100644 --- a/src/markitdown/__main__.py +++ b/src/markitdown/__main__.py @@ -4,8 +4,8 @@ import argparse import sys from textwrap import dedent -from .__about__ import __version__ -from ._markitdown import MarkItDown, DocumentConverterResult +from __about__ import __version__ +from _markitdown import MarkItDown, DocumentConverterResult def main(): @@ -57,16 +57,37 @@ def main(): "--output", help="Output file name. If not provided, output is written to stdout.", ) + parser.add_argument( + "-d", + "--use-docintel", + action="store_true", + help="Use Document Intelligence to extract text instead of offline conversion. Requires a valid Document Intelligence Endpoint.", + ) + parser.add_argument( + "-e", + "--endpoint", + type=str, + help="Document Intelligence Endpoint. 
Required if using Document Intelligence.", + ) args = parser.parse_args() - if args.filename is None: + if args.use_docintel: + if args.endpoint is None: + raise ValueError( + "Document Intelligence Endpoint is required when using Document Intelligence." + ) + elif args.filename is None: + raise ValueError("Filename is required when using Document Intelligence.") + markitdown = MarkItDown(docintel_endpoint=args.endpoint) + else: markitdown = MarkItDown() + + if args.filename is None: result = markitdown.convert_stream(sys.stdin.buffer) - _handle_output(args, result) else: - markitdown = MarkItDown() result = markitdown.convert(args.filename) - _handle_output(args, result) + + _handle_output(args, result) def _handle_output(args, result: DocumentConverterResult): diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 33806e13..ae6a7b4f 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -33,6 +33,19 @@ from bs4 import BeautifulSoup from charset_normalizer import from_path +# Azure imports +from azure.ai.documentintelligence import DocumentIntelligenceClient +from azure.ai.documentintelligence.models import ( + AnalyzeDocumentRequest, + AnalyzeResult, + DocumentAnalysisFeature, +) +from azure.identity import DefaultAzureCredential + +# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum. +# This constant is a temporary fix until the bug is resolved. 
+CONTENT_FORMAT = "markdown" + # Optional Transcription support IS_AUDIO_TRANSCRIPTION_CAPABLE = False try: @@ -1318,6 +1331,74 @@ def convert( ) +class DocumentIntelligenceConverter(DocumentConverter): + """Specialized DocumentConverter that uses Document Intelligence to extract text from documents.""" + + def __init__( + self, + endpoint: str, + api_version: str = "2024-07-31-preview", + ): + self.endpoint = endpoint + self.api_version = api_version + self.doc_intel_client = DocumentIntelligenceClient( + endpoint=self.endpoint, + api_version=self.api_version, + credential=DefaultAzureCredential(), + ) + + def convert( + self, local_path: str, **kwargs: Any + ) -> Union[None, DocumentConverterResult]: + # Bail if extension is not supported by Document Intelligence + extension = kwargs.get("file_extension", "") + docintel_extensions = [ + ".pdf", + ".docx", + ".xlsx", + ".pptx", + ".html", + ".jpeg", + ".jpg", + ".png", + ".bmp", + ".tiff", + ".heif", + ] + if extension.lower() not in docintel_extensions: + return None + + # Get the bytestring for the local path + with open(local_path, "rb") as f: + file_bytes = f.read() + + # Certain document analysis features are not available for filetypes (.xlsx, .pptx, .html) + if extension.lower() in [".xlsx", ".pptx", ".html"]: + analysis_features = [] + else: + analysis_features = [ + DocumentAnalysisFeature.FORMULAS, # enable formula extraction + DocumentAnalysisFeature.OCR_HIGH_RESOLUTION, # enable high resolution OCR + DocumentAnalysisFeature.STYLE_FONT, # enable font style extraction + ] + + # Extract the text using Azure Document Intelligence + poller = self.doc_intel_client.begin_analyze_document( + model_id="prebuilt-layout", + body=AnalyzeDocumentRequest(bytes_source=file_bytes), + features=analysis_features, + output_content_format=CONTENT_FORMAT, # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed + ) + result: AnalyzeResult = poller.result() + + # remove comments from the markdown content 
generated by Doc Intelligence and append to markdown string + markdown_text = re.sub(r"<!--.*?-->", "", result.content, flags=re.DOTALL) + return DocumentConverterResult( + title=None, + text_content=markdown_text, + ) + + class FileConversionException(BaseException): pass @@ -1337,6 +1418,7 @@ def __init__( llm_model: Optional[str] = None, style_map: Optional[str] = None, exiftool_path: Optional[str] = None, + docintel_endpoint: Optional[str] = None, # Deprecated mlm_client: Optional[Any] = None, mlm_model: Optional[str] = None, @@ -1406,6 +1488,12 @@ def __init__( self.register_page_converter(ZipConverter()) self.register_page_converter(OutlookMsgConverter()) + # Register Document Intelligence converter at the top of the stack if endpoint is provided + if docintel_endpoint is not None: + self.register_page_converter( + DocumentIntelligenceConverter(endpoint=docintel_endpoint) + ) + def convert( self, source: Union[str, requests.Response, Path], **kwargs: Any ) -> DocumentConverterResult: # TODO: deal with kwargs From bf6a15e9b5eb89820bf82c04cbe934bf62fb8617 Mon Sep 17 00:00:00 2001 From: KennyZhang1 <90438893+KennyZhang1@users.noreply.github.com> Date: Sat, 1 Feb 2025 01:23:26 -0500 Subject: [PATCH 2/5] Kennyzhang/docintel docs (#312) * updated docs to include doc intelligence * include reference to doc intel setup docs --- README.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/README.md b/README.md index 6bc91e6c..76a4d3f1 100644 --- a/README.md +++ b/README.md @@ -33,12 +33,20 @@ Or use `-o` to specify the output file: markitdown path-to-file.pdf -o document.md ``` +To use Document Intelligence conversion: + +```bash +markitdown path-to-file.pdf -o document.md -d -e "<document_intelligence_endpoint>" +``` + You can also pipe content: ```bash cat path-to-file.pdf | markitdown ``` +More information about how to set up an Azure Document Intelligence Resource can be found 
[here](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/how-to-guides/create-document-intelligence-resource?view=doc-intel-4.0.0) + ### Python API Basic usage in Python: @@ -51,6 +59,16 @@ result = md.convert("test.xlsx") print(result.text_content) ``` +Document Intelligence conversion in Python: + +```python +from markitdown import MarkItDown + +md = MarkItDown(docintel_endpoint="<document_intelligence_endpoint>") +result = md.convert("test.pdf") +print(result.text_content) +``` + To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`: ```python From 7bea2672a05f5877acb8690b20222593dab13788 Mon Sep 17 00:00:00 2001 From: ZeyuTeng96 <96521059+ZeyuTeng96@users.noreply.github.com> Date: Sun, 9 Feb 2025 12:28:35 +0800 Subject: [PATCH 3/5] remove leading and trailing \n for HtmlConverter (#262) --- src/markitdown/_markitdown.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index ae6a7b4f..6f405478 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -236,6 +236,9 @@ def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]: assert isinstance(webpage_text, str) + # remove leading and trailing \n + webpage_text = webpage_text.strip() + return DocumentConverterResult( title=None if soup.title is None else soup.title.string, text_content=webpage_text, From 3090917a49dc8ec94682c47747f3e2692e3953ae Mon Sep 17 00:00:00 2001 From: James Hickey Date: Sun, 9 Feb 2025 00:30:13 -0400 Subject: [PATCH 4/5] Typo fixed (#270) --- src/markitdown/_markitdown.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 6f405478..e4884ec2 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -217,7 +217,7 @@ def convert( return result def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]: - """Helper function that 
converts and HTML string.""" + """Helper function that converts an HTML string.""" # Parse the string soup = BeautifulSoup(html_content, "html.parser") From 7cf5e0bb23980cd004ceeea476c1bde3246d3c84 Mon Sep 17 00:00:00 2001 From: masquare Date: Sun, 9 Feb 2025 05:37:34 +0100 Subject: [PATCH 5/5] feat(pptx): support image description with LLM for pptx files (#306) --- src/markitdown/_markitdown.py | 62 +++++++++++++++++++++++++++++++---- 1 file changed, 56 insertions(+), 6 deletions(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index e4884ec2..9f610f6f 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -787,6 +787,35 @@ class PptxConverter(HtmlConverter): Converts PPTX files to Markdown. Supports heading, tables and images with alt text. """ + def _get_llm_description( + self, llm_client, llm_model, image_blob, content_type, prompt=None + ): + if prompt is None or prompt.strip() == "": + prompt = "Write a detailed alt text for this image with less than 50 words." 
+ + image_base64 = base64.b64encode(image_blob).decode("utf-8") + data_uri = f"data:{content_type};base64,{image_base64}" + + messages = [ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": data_uri, + }, + }, + {"type": "text", "text": prompt}, + ], + } + ] + + response = llm_client.chat.completions.create( + model=llm_model, messages=messages + ) + return response.choices[0].message.content + def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: # Bail if not a PPTX extension = kwargs.get("file_extension", "") @@ -807,17 +836,38 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: # Pictures if self._is_picture(shape): # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069 - alt_text = "" - try: - alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "") - except Exception: - pass + + llm_description = None + alt_text = None + + llm_client = kwargs.get("llm_client") + llm_model = kwargs.get("llm_model") + if llm_client is not None and llm_model is not None: + try: + llm_description = self._get_llm_description( + llm_client, + llm_model, + shape.image.blob, + shape.image.content_type, + ) + except Exception: + # Unable to describe with LLM + pass + + if not llm_description: + try: + alt_text = shape._element._nvXxPr.cNvPr.attrib.get( + "descr", "" + ) + except Exception: + # Unable to get alt text + pass # A placeholder name filename = re.sub(r"\W", "", shape.name) + ".jpg" md_content += ( "\n![" - + (alt_text if alt_text else shape.name) + + (llm_description or alt_text or shape.name) + "](" + filename + ")\n"