Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added priority argument to all converter constructors. #324

Merged
merged 7 commits into from
Feb 11, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 5 additions & 8 deletions packages/markitdown/src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,6 @@
# Override mimetype for csv to fix issue on windows
mimetypes.add_type("text/csv", ".csv")

PRIORITY_SPECIFIC_FILE_FORMAT = 0.0
PRIORITY_GENERIC_FILE_FORMAT = 10.0


_plugins: Union[None | List[Any]] = None


Expand Down Expand Up @@ -123,6 +119,8 @@ def enable_builtins(self, **kwargs) -> None:
self._llm_model = kwargs.get("llm_model")
self._exiftool_path = kwargs.get("exiftool_path")
self._style_map = kwargs.get("style_map")
if self._exiftool_path is None:
self._exiftool_path = os.getenv("EXIFTOOL_PATH")

# Register converters for successful browsing operations
# Later registrations are tried first / take higher priority than earlier registrations
Expand Down Expand Up @@ -349,11 +347,10 @@ def _convert(
_kwargs["_parent_converters"] = self._page_converters

# If we hit an error log it and keep trying
# try:
if True:
try:
res = converter.convert(local_path, **_kwargs)
# except Exception:
# error_trace = ("\n\n" + traceback.format_exc()).strip()
except Exception:
error_trace = ("\n\n" + traceback.format_exc()).strip()

if res is not None:
# Normalize the content
Expand Down
31 changes: 30 additions & 1 deletion packages/markitdown/src/markitdown/converters/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,36 @@ def __init__(self, title: Union[str, None] = None, text_content: str = ""):
class DocumentConverter:
"""Abstract superclass of all DocumentConverters."""

def __init__(self, priority: float = 0.0):
# Lower priority values are tried first.
PRIORITY_SPECIFIC_FILE_FORMAT = (
0.0 # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia
)
PRIORITY_GENERIC_FILE_FORMAT = (
10.0 # Near catch-all converters for mimetypes like text/*, etc.
)

def __init__(self, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT):
"""
Initialize the DocumentConverter with a given priority.

Priorities work as follows: Lower values are tried first (i.e., they
have higher priority). By default, most converters get priority
DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exceptions
are the near catch-all converters, such as the PlainTextConverter, which
get priority DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT (== 10).

Just prior to conversion, the converters are sorted by priority, using
a stable sort. This means that converters with the same priority will
remain in the same order, with the most recently registered converters
appearing first.

We have tight control over the order of built-in converters, but
plugins can register converters in any order. A converter's priority
field reasserts some control over the order of converters.

Plugins can register converters with any priority, to appear before or
after the built-ins. For example, a plugin with priority 9 will run
before the PlainTextConverter (priority 10), but after the
format-specific built-in converters (priority 0).
"""
self._priority = priority

def convert(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@ class BingSerpConverter(DocumentConverter):
NOTE: It is better to use the Bing API
"""

def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)

def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a Bing SERP
extension = kwargs.get("file_extension", "")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,13 @@ class DocumentIntelligenceConverter(DocumentConverter):

def __init__(
self,
*,
priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
endpoint: str,
api_version: str = "2024-07-31-preview",
):
super().__init__(priority=priority)

self.endpoint = endpoint
self.api_version = api_version
self.doc_intel_client = DocumentIntelligenceClient(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
DocumentConverterResult,
)

from ._base import DocumentConverter
from ._html_converter import HtmlConverter


Expand All @@ -14,6 +15,11 @@ class DocxConverter(HtmlConverter):
Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
"""

def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)

def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a DOCX
extension = kwargs.get("file_extension", "")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@
class HtmlConverter(DocumentConverter):
"""Anything with content type text/html"""

def __init__(
self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
):
super().__init__(priority=priority)

def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from typing import Union
from ._base import DocumentConverterResult
from ._base import DocumentConverter, DocumentConverterResult
from ._media_converter import MediaConverter


Expand All @@ -8,6 +8,11 @@ class ImageConverter(MediaConverter):
Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
"""

def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)

def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not an image
extension = kwargs.get("file_extension", "")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@
class IpynbConverter(DocumentConverter):
"""Converts Jupyter Notebook (.ipynb) files to Markdown."""

def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)

def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@ class MediaConverter(DocumentConverter):
Abstract class for multi-modal media (e.g., images and audio)
"""

def __init__(
self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
):
super().__init__(priority=priority)

def _get_metadata(self, local_path, exiftool_path=None):
if not exiftool_path:
which_exiftool = shutil.which("exiftool")
Expand All @@ -27,10 +32,10 @@ def _get_metadata(self, local_path, exiftool_path=None):

return None
else:
try:
if True:
result = subprocess.run(
[exiftool_path, "-json", local_path], capture_output=True, text=True
).stdout
return json.loads(result)[0]
except Exception:
return None
# except Exception:
# return None
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import tempfile
from typing import Union
from ._base import DocumentConverterResult
from ._base import DocumentConverter, DocumentConverterResult
from ._wav_converter import WavConverter
from warnings import resetwarnings, catch_warnings

Expand Down Expand Up @@ -28,6 +28,11 @@ class Mp3Converter(WavConverter):
Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
"""

def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)

def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a MP3
extension = kwargs.get("file_extension", "")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@ class OutlookMsgConverter(DocumentConverter):
- Email body content
"""

def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)

def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@ class PdfConverter(DocumentConverter):
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
"""

def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)

def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a PDF
extension = kwargs.get("file_extension", "")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@
class PlainTextConverter(DocumentConverter):
"""Anything with content type text/plain"""

def __init__(
self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
):
super().__init__(priority=priority)

def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@ class PptxConverter(HtmlConverter):
Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
"""

def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)

def _get_llm_description(
self, llm_client, llm_model, image_blob, content_type, prompt=None
):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@
class RssConverter(DocumentConverter):
"""Convert RSS / Atom type to markdown"""

def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)

def convert(
self, local_path: str, **kwargs
) -> Union[None, DocumentConverterResult]:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from typing import Union
from ._base import DocumentConverterResult
from ._base import DocumentConverter, DocumentConverterResult
from ._media_converter import MediaConverter

# Optional Transcription support
Expand All @@ -17,6 +17,11 @@ class WavConverter(MediaConverter):
Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
"""

def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)

def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a WAV
extension = kwargs.get("file_extension", "")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,11 @@
class WikipediaConverter(DocumentConverter):
"""Handle Wikipedia pages separately, focusing only on the main document content."""

def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)

def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import pandas as pd

from ._base import DocumentConverterResult
from ._base import DocumentConverter, DocumentConverterResult
from ._html_converter import HtmlConverter


Expand All @@ -11,6 +11,11 @@ class XlsxConverter(HtmlConverter):
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
"""

def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)

def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a XLSX
extension = kwargs.get("file_extension", "")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@
class YouTubeConverter(DocumentConverter):
"""Handle YouTube specially, focusing on the video title, description, and transcript."""

def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)

def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,11 @@ class ZipConverter(DocumentConverter):
- Cleans up temporary files after processing
"""

def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)

def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
Expand Down
6 changes: 3 additions & 3 deletions packages/markitdown/tests/test_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,8 +327,8 @@ def test_markitdown_llm() -> None:

if __name__ == "__main__":
"""Runs this file's tests from the command line."""
# test_markitdown_remote()
# test_markitdown_local()
test_markitdown_remote()
test_markitdown_local()
test_markitdown_exiftool()
# test_markitdown_deprecation()
# test_markitdown_llm()
print("All tests passed!")