From bfde8574204d1111309fbbff0b4347a3bf518676 Mon Sep 17 00:00:00 2001
From: KennyZhang1 <90438893+KennyZhang1@users.noreply.github.com>
Date: Fri, 24 Jan 2025 17:09:32 -0500
Subject: [PATCH 1/5] Add support for conversion via Document Intelligence
 (#303)

* added cli params for doc intel

* added DocumentIntelligenceConverter class implementation

* initialized doc intel client instance field

* added isolated doc_intel main conversion function

* temp fix for ContentFormat import bug

* ran tests for docintel and offline for many filetypes

* push doc intel converter to the top of the stack

* formatting changes

* modified project toml file
---
 pyproject.toml                |  2 +
 src/markitdown/__main__.py    | 33 ++++++++++---
 src/markitdown/_markitdown.py | 88 +++++++++++++++++++++++++++++++++++
 3 files changed, 117 insertions(+), 6 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 9c113ade..2a4e203e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -42,6 +42,8 @@ dependencies = [
   "pathvalidate",
   "charset-normalizer",
   "openai",
+  "azure-ai-documentintelligence",
+  "azure-identity"
 ]
 
 [project.urls]
diff --git a/src/markitdown/__main__.py b/src/markitdown/__main__.py
index b6cf963b..69e8f0ea 100644
--- a/src/markitdown/__main__.py
+++ b/src/markitdown/__main__.py
@@ -4,8 +4,8 @@
 import argparse
 import sys
 from textwrap import dedent
-from .__about__ import __version__
-from ._markitdown import MarkItDown, DocumentConverterResult
+from .__about__ import __version__
+from ._markitdown import MarkItDown, DocumentConverterResult
 
 
 def main():
@@ -57,16 +57,37 @@ def main():
         "--output",
         help="Output file name. If not provided, output is written to stdout.",
     )
+    parser.add_argument(
+        "-d",
+        "--use-docintel",
+        action="store_true",
+        help="Use Document Intelligence to extract text instead of offline conversion. Requires a valid Document Intelligence Endpoint.",
+    )
+    parser.add_argument(
+        "-e",
+        "--endpoint",
+        type=str,
+        help="Document Intelligence Endpoint. Required if using Document Intelligence.",
+    )
     args = parser.parse_args()
 
-    if args.filename is None:
+    if args.use_docintel:
+        if args.endpoint is None:
+            raise ValueError(
+                "Document Intelligence Endpoint is required when using Document Intelligence."
+            )
+        elif args.filename is None:
+            raise ValueError("Filename is required when using Document Intelligence.")
+        markitdown = MarkItDown(docintel_endpoint=args.endpoint)
+    else:
         markitdown = MarkItDown()
+
+    if args.filename is None:
         result = markitdown.convert_stream(sys.stdin.buffer)
-        _handle_output(args, result)
     else:
-        markitdown = MarkItDown()
         result = markitdown.convert(args.filename)
-        _handle_output(args, result)
+
+    _handle_output(args, result)
 
 
 def _handle_output(args, result: DocumentConverterResult):
diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index 33806e13..ae6a7b4f 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -33,6 +33,19 @@
 from bs4 import BeautifulSoup
 from charset_normalizer import from_path
 
+# Azure imports
+from azure.ai.documentintelligence import DocumentIntelligenceClient
+from azure.ai.documentintelligence.models import (
+    AnalyzeDocumentRequest,
+    AnalyzeResult,
+    DocumentAnalysisFeature,
+)
+from azure.identity import DefaultAzureCredential
+
+# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
+# This constant is a temporary fix until the bug is resolved.
+CONTENT_FORMAT = "markdown"
+
 # Optional Transcription support
 IS_AUDIO_TRANSCRIPTION_CAPABLE = False
 try:
@@ -1318,6 +1331,74 @@ def convert(
             )
 
 
+class DocumentIntelligenceConverter(DocumentConverter):
+    """DocumentConverter that extracts Markdown from documents via Azure Document Intelligence (prebuilt-layout model)."""
+
+    def __init__(
+        self,
+        endpoint: str,
+        api_version: str = "2024-07-31-preview",
+    ):
+        self.endpoint = endpoint
+        self.api_version = api_version
+        self.doc_intel_client = DocumentIntelligenceClient(
+            endpoint=self.endpoint,
+            api_version=self.api_version,
+            credential=DefaultAzureCredential(),  # no key argument; auth is left to azure-identity
+        )
+
+    def convert(
+        self, local_path: str, **kwargs: Any
+    ) -> Union[None, DocumentConverterResult]:
+        # Bail (return None) if the "file_extension" kwarg is not in the allow-list below
+        extension = kwargs.get("file_extension", "")
+        docintel_extensions = [
+            ".pdf",
+            ".docx",
+            ".xlsx",
+            ".pptx",
+            ".html",
+            ".jpeg",
+            ".jpg",
+            ".png",
+            ".bmp",
+            ".tiff",
+            ".heif",
+        ]
+        if extension.lower() not in docintel_extensions:
+            return None
+
+        # Read the whole file into memory; the bytes are sent inline as the request's bytes_source
+        with open(local_path, "rb") as f:
+            file_bytes = f.read()
+
+        # Certain document analysis features are not available for some filetypes (.xlsx, .pptx, .html)
+        if extension.lower() in [".xlsx", ".pptx", ".html"]:
+            analysis_features = []
+        else:
+            analysis_features = [
+                DocumentAnalysisFeature.FORMULAS,  # enable formula extraction
+                DocumentAnalysisFeature.OCR_HIGH_RESOLUTION,  # enable high resolution OCR
+                DocumentAnalysisFeature.STYLE_FONT,  # enable font style extraction
+            ]
+
+        # Extract the text using Azure Document Intelligence
+        poller = self.doc_intel_client.begin_analyze_document(
+            model_id="prebuilt-layout",
+            body=AnalyzeDocumentRequest(bytes_source=file_bytes),
+            features=analysis_features,
+            output_content_format=CONTENT_FORMAT,  # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed
+        )
+        result: AnalyzeResult = poller.result()
+
+        # Strip HTML comments (<!-- ... -->, possibly multi-line) from the Markdown returned by Document Intelligence
+        markdown_text = re.sub(r"<!--.*?-->", "", result.content, flags=re.DOTALL)
+        return DocumentConverterResult(
+            title=None,
+            text_content=markdown_text,
+        )
+
+
 class FileConversionException(BaseException):
     pass
 
@@ -1337,6 +1418,7 @@ def __init__(
         llm_model: Optional[str] = None,
         style_map: Optional[str] = None,
         exiftool_path: Optional[str] = None,
+        docintel_endpoint: Optional[str] = None,
         # Deprecated
         mlm_client: Optional[Any] = None,
         mlm_model: Optional[str] = None,
@@ -1406,6 +1488,12 @@ def __init__(
         self.register_page_converter(ZipConverter())
         self.register_page_converter(OutlookMsgConverter())
 
+        # Register Document Intelligence converter at the top of the stack if endpoint is provided
+        if docintel_endpoint is not None:
+            self.register_page_converter(
+                DocumentIntelligenceConverter(endpoint=docintel_endpoint)
+            )
+
     def convert(
         self, source: Union[str, requests.Response, Path], **kwargs: Any
     ) -> DocumentConverterResult:  # TODO: deal with kwargs

From bf6a15e9b5eb89820bf82c04cbe934bf62fb8617 Mon Sep 17 00:00:00 2001
From: KennyZhang1 <90438893+KennyZhang1@users.noreply.github.com>
Date: Sat, 1 Feb 2025 01:23:26 -0500
Subject: [PATCH 2/5] Kennyzhang/docintel docs (#312)

* updated docs to include doc intelligence

* include reference to doc intel setup docs
---
 README.md | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/README.md b/README.md
index 6bc91e6c..76a4d3f1 100644
--- a/README.md
+++ b/README.md
@@ -33,12 +33,20 @@ Or use `-o` to specify the output file:
 markitdown path-to-file.pdf -o document.md
 ```
 
+To use Document Intelligence conversion:
+
+```bash
+markitdown path-to-file.pdf -o document.md -d -e "<document_intelligence_endpoint>"
+```
+
 You can also pipe content:
 
 ```bash
 cat path-to-file.pdf | markitdown
 ```
 
+More information about how to set up an Azure Document Intelligence Resource can be found [here](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/how-to-guides/create-document-intelligence-resource?view=doc-intel-4.0.0).
+
 ### Python API
 
 Basic usage in Python:
@@ -51,6 +59,16 @@ result = md.convert("test.xlsx")
 print(result.text_content)
 ```
 
+Document Intelligence conversion in Python:
+
+```python
+from markitdown import MarkItDown
+
+md = MarkItDown(docintel_endpoint="<document_intelligence_endpoint>")
+result = md.convert("test.pdf")
+print(result.text_content)
+```
+
 To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`:
 
 ```python

From 7bea2672a05f5877acb8690b20222593dab13788 Mon Sep 17 00:00:00 2001
From: ZeyuTeng96 <96521059+ZeyuTeng96@users.noreply.github.com>
Date: Sun, 9 Feb 2025 12:28:35 +0800
Subject: [PATCH 3/5] remove leading and trailing \n for HtmlConverter (#262)

---
 src/markitdown/_markitdown.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index ae6a7b4f..6f405478 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -236,6 +236,9 @@ def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
 
         assert isinstance(webpage_text, str)
 
+        # remove leading and trailing \n
+        webpage_text = webpage_text.strip()
+
         return DocumentConverterResult(
             title=None if soup.title is None else soup.title.string,
             text_content=webpage_text,

From 3090917a49dc8ec94682c47747f3e2692e3953ae Mon Sep 17 00:00:00 2001
From: James Hickey <jamesmh@users.noreply.github.com>
Date: Sun, 9 Feb 2025 00:30:13 -0400
Subject: [PATCH 4/5] Typo fixed (#270)

---
 src/markitdown/_markitdown.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index 6f405478..e4884ec2 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -217,7 +217,7 @@ def convert(
         return result
 
     def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
-        """Helper function that converts and HTML string."""
+        """Helper function that converts an HTML string."""
 
         # Parse the string
         soup = BeautifulSoup(html_content, "html.parser")

From 7cf5e0bb23980cd004ceeea476c1bde3246d3c84 Mon Sep 17 00:00:00 2001
From: masquare <masquare@users.noreply.github.com>
Date: Sun, 9 Feb 2025 05:37:34 +0100
Subject: [PATCH 5/5] feat(pptx): support image description with LLM for pptx
 files (#306)

---
 src/markitdown/_markitdown.py | 62 +++++++++++++++++++++++++++++++----
 1 file changed, 56 insertions(+), 6 deletions(-)

diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index e4884ec2..9f610f6f 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -787,6 +787,35 @@ class PptxConverter(HtmlConverter):
     Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
     """
 
+    def _get_llm_description(
+        self, llm_client, llm_model, image_blob, content_type, prompt=None
+    ):
+        if prompt is None or prompt.strip() == "":  # use the default prompt
+            prompt = "Write a detailed alt text for this image with less than 50 words."
+
+        image_base64 = base64.b64encode(image_blob).decode("utf-8")
+        data_uri = f"data:{content_type};base64,{image_base64}"  # image inlined
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": data_uri,
+                        },
+                    },
+                    {"type": "text", "text": prompt},
+                ],
+            }
+        ]
+
+        response = llm_client.chat.completions.create(
+            model=llm_model, messages=messages
+        )
+        return response.choices[0].message.content  # only the first choice is used
+
     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
         # Bail if not a PPTX
         extension = kwargs.get("file_extension", "")
@@ -807,17 +836,38 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
                 # Pictures
                 if self._is_picture(shape):
                     # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069
-                    alt_text = ""
-                    try:
-                        alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
-                    except Exception:
-                        pass
+
+                    llm_description = None
+                    alt_text = None
+
+                    llm_client = kwargs.get("llm_client")
+                    llm_model = kwargs.get("llm_model")
+                    if llm_client is not None and llm_model is not None:
+                        try:
+                            llm_description = self._get_llm_description(
+                                llm_client,
+                                llm_model,
+                                shape.image.blob,
+                                shape.image.content_type,
+                            )
+                        except Exception:
+                            # Unable to describe with LLM
+                            pass
+
+                    if not llm_description:
+                        try:
+                            alt_text = shape._element._nvXxPr.cNvPr.attrib.get(
+                                "descr", ""
+                            )
+                        except Exception:
+                            # Unable to get alt text
+                            pass
 
                     # A placeholder name
                     filename = re.sub(r"\W", "", shape.name) + ".jpg"
                     md_content += (
                         "\n!["
-                        + (alt_text if alt_text else shape.name)
+                        + (llm_description or alt_text or shape.name)
                         + "]("
                         + filename
                         + ")\n"