From 4298cfad8d500e780312be79999499db5e6ed96b Mon Sep 17 00:00:00 2001 From: Adam Fourney Date: Tue, 11 Feb 2025 10:13:36 -0800 Subject: [PATCH 1/4] Added priority argument to all converter constructors. --- .../markitdown/src/markitdown/_markitdown.py | 23 +++++++++++++++---- .../src/markitdown/converters/_base.py | 10 +++++++- .../converters/_bing_serp_converter.py | 5 ++++ .../converters/_doc_intel_converter.py | 4 ++++ .../markitdown/converters/_docx_converter.py | 6 +++++ .../markitdown/converters/_html_converter.py | 5 ++++ .../markitdown/converters/_image_converter.py | 7 +++++- .../markitdown/converters/_ipynb_converter.py | 5 ++++ .../markitdown/converters/_media_converter.py | 11 ++++++--- .../markitdown/converters/_mp3_converter.py | 7 +++++- .../converters/_outlook_msg_converter.py | 5 ++++ .../markitdown/converters/_pdf_converter.py | 5 ++++ .../converters/_plain_text_converter.py | 5 ++++ .../markitdown/converters/_pptx_converter.py | 5 ++++ .../markitdown/converters/_rss_converter.py | 5 ++++ .../markitdown/converters/_wav_converter.py | 7 +++++- .../converters/_wikipedia_converter.py | 5 ++++ .../markitdown/converters/_xlsx_converter.py | 7 +++++- .../converters/_youtube_converter.py | 5 ++++ .../markitdown/converters/_zip_converter.py | 5 ++++ packages/markitdown/tests/test_markitdown.py | 6 ++--- 21 files changed, 128 insertions(+), 15 deletions(-) diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index b7ac5bc..a438903 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -47,10 +47,6 @@ # Override mimetype for csv to fix issue on windows mimetypes.add_type("text/csv", ".csv") -PRIORITY_SPECIFIC_FILE_FORMAT = 0.0 -PRIORITY_GENERIC_FILE_FORMAT = 10.0 - - _plugins: Union[None | List[Any]] = None @@ -103,6 +99,23 @@ def __init__( # Register the converters self._page_converters: List[DocumentConverter] = [] + # Note: We have tight control over the order of built-in converters, but + # plugins can register converters in any order. A converter's .priority + # reasserts some control over the order of converters. + # + # Priorities work as follows. By default, most converters get priority + # DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception + # is the PlainTextConverter, which gets priority PRIORITY_SPECIFIC_FILE_FORMAT (== 10), + # with lower values being tried first (i.e., higher priority). + # + # Just prior to conversion, the converters are sorted by priority, using + # a stable sort. This means that converters with the same priority will + # remain in the same order, with the most recently registered converters + # appearing first. + # + # Plugins can register converters with any priority, to appear before or + # after the built-ins. For example, a plugin with priority 9 will run + # before the PlainTextConverter, but after the built-in converters. if ( enable_builtins is None or enable_builtins ): # Default to True when not specified @@ -123,6 +136,8 @@ def enable_builtins(self, **kwargs) -> None: self._llm_model = kwargs.get("llm_model") self._exiftool_path = kwargs.get("exiftool_path") self._style_map = kwargs.get("style_map") + if self._exiftool_path is None: + self._exiftool_path = os.getenv("EXIFTOOL_PATH") # Register converters for successful browsing operations # Later registrations are tried first / take higher priority than earlier registrations diff --git a/packages/markitdown/src/markitdown/converters/_base.py b/packages/markitdown/src/markitdown/converters/_base.py index 6d0a5a4..0a768dc 100644 --- a/packages/markitdown/src/markitdown/converters/_base.py +++ b/packages/markitdown/src/markitdown/converters/_base.py @@ -12,7 +12,15 @@ def __init__(self, title: Union[str, None] = None, text_content: str = ""): class DocumentConverter: """Abstract superclass of all DocumentConverters.""" - def __init__(self, priority: float = 0.0): + # Lower priority values are tried first. + PRIORITY_SPECIFIC_FILE_FORMAT = ( + 0.0 # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia + ) + PRIORITY_GENERIC_FILE_FORMAT = ( + 10.0 # Near catch-all converters for mimetypes like text/*, etc. + ) + + def __init__(self, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT): self._priority = priority def convert( diff --git a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py index b903724..d1b11a6 100644 --- a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py +++ b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py @@ -16,6 +16,11 @@ class BingSerpConverter(DocumentConverter): NOTE: It is better to use the Bing API """ + def __init__( + self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT + ): + super().__init__(priority=priority) + def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: # Bail if not a Bing SERP extension = kwargs.get("file_extension", "") diff --git a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py index 94acc9f..835345a 100644 --- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py +++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py @@ -22,9 +22,13 @@ class DocumentIntelligenceConverter(DocumentConverter): def __init__( self, + *, + priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT, endpoint: str, api_version: str = "2024-07-31-preview", ): + super().__init__(priority=priority) + self.endpoint = endpoint self.api_version = api_version self.doc_intel_client = DocumentIntelligenceClient( diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py index fb61cca..8515f6d 100644 --- a/packages/markitdown/src/markitdown/converters/_docx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py @@ -6,6 +6,7 @@ DocumentConverterResult, ) +from ._base import DocumentConverter from ._html_converter import HtmlConverter @@ -14,6 +15,11 @@ class DocxConverter(HtmlConverter): Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible. """ + def __init__( + self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT + ): + super().__init__(priority=priority) + def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: # Bail if not a DOCX extension = kwargs.get("file_extension", "") diff --git a/packages/markitdown/src/markitdown/converters/_html_converter.py b/packages/markitdown/src/markitdown/converters/_html_converter.py index ae7259e..68c2536 100644 --- a/packages/markitdown/src/markitdown/converters/_html_converter.py +++ b/packages/markitdown/src/markitdown/converters/_html_converter.py @@ -8,6 +8,11 @@ class HtmlConverter(DocumentConverter): """Anything with content type text/html""" + def __init__( + self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT + ): + super().__init__(priority=priority) + def convert( self, local_path: str, **kwargs: Any ) -> Union[None, DocumentConverterResult]: diff --git a/packages/markitdown/src/markitdown/converters/_image_converter.py b/packages/markitdown/src/markitdown/converters/_image_converter.py index f3dee6b..a46b67c 100644 --- a/packages/markitdown/src/markitdown/converters/_image_converter.py +++ b/packages/markitdown/src/markitdown/converters/_image_converter.py @@ -1,5 +1,5 @@ from typing import Union -from ._base import DocumentConverterResult +from ._base import DocumentConverter, DocumentConverterResult from ._media_converter import MediaConverter @@ -8,6 +8,11 @@ class ImageConverter(MediaConverter): Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured). """ + def __init__( + self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT + ): + super().__init__(priority=priority) + def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: # Bail if not an image extension = kwargs.get("file_extension", "") diff --git a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py index cdeb478..b487f41 100644 --- a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py +++ b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py @@ -12,6 +12,11 @@ class IpynbConverter(DocumentConverter): """Converts Jupyter Notebook (.ipynb) files to Markdown.""" + def __init__( + self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT + ): + super().__init__(priority=priority) + def convert( self, local_path: str, **kwargs: Any ) -> Union[None, DocumentConverterResult]: diff --git a/packages/markitdown/src/markitdown/converters/_media_converter.py b/packages/markitdown/src/markitdown/converters/_media_converter.py index 07d2bde..5c7d82b 100644 --- a/packages/markitdown/src/markitdown/converters/_media_converter.py +++ b/packages/markitdown/src/markitdown/converters/_media_converter.py @@ -11,6 +11,11 @@ class MediaConverter(DocumentConverter): Abstract class for multi-modal media (e.g., images and audio) """ + def __init__( + self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT + ): + super().__init__(priority=priority) + def _get_metadata(self, local_path, exiftool_path=None): if not exiftool_path: which_exiftool = shutil.which("exiftool") @@ -27,10 +32,10 @@ def _get_metadata(self, local_path, exiftool_path=None): return None else: - try: + if True: result = subprocess.run( [exiftool_path, "-json", local_path], capture_output=True, text=True ).stdout return json.loads(result)[0] - except Exception: - return None + # except Exception: + # return None diff --git a/packages/markitdown/src/markitdown/converters/_mp3_converter.py b/packages/markitdown/src/markitdown/converters/_mp3_converter.py index 6b2786b..91fd270 100644 --- a/packages/markitdown/src/markitdown/converters/_mp3_converter.py +++ b/packages/markitdown/src/markitdown/converters/_mp3_converter.py @@ -1,6 +1,6 @@ import tempfile from typing import Union -from ._base import DocumentConverterResult +from ._base import DocumentConverter, DocumentConverterResult from ._wav_converter import WavConverter from warnings import resetwarnings, catch_warnings @@ -28,6 +28,11 @@ class Mp3Converter(WavConverter): Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed). """ + def __init__( + self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT + ): + super().__init__(priority=priority) + def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: # Bail if not a MP3 extension = kwargs.get("file_extension", "") diff --git a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py index e83001c..6764fc5 100644 --- a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py +++ b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py @@ -11,6 +11,11 @@ class OutlookMsgConverter(DocumentConverter): - Email body content """ + def __init__( + self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT + ): + super().__init__(priority=priority) + def convert( self, local_path: str, **kwargs: Any ) -> Union[None, DocumentConverterResult]: diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py index dcffc62..3a2b671 100644 --- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py @@ -9,6 +9,11 @@ class PdfConverter(DocumentConverter): Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text. """ + def __init__( + self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT + ): + super().__init__(priority=priority) + def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: # Bail if not a PDF extension = kwargs.get("file_extension", "") diff --git a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py index 2912d24..75f74a8 100644 --- a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py +++ b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py @@ -9,6 +9,11 @@ class PlainTextConverter(DocumentConverter): """Anything with content type text/plain""" + def __init__( + self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT + ): + super().__init__(priority=priority) + def convert( self, local_path: str, **kwargs: Any ) -> Union[None, DocumentConverterResult]: diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py index a48880a..afb37a0 100644 --- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py @@ -14,6 +14,11 @@ class PptxConverter(HtmlConverter): Converts PPTX files to Markdown. Supports heading, tables and images with alt text. """ + def __init__( + self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT + ): + super().__init__(priority=priority) + def _get_llm_description( self, llm_client, llm_model, image_blob, content_type, prompt=None ): diff --git a/packages/markitdown/src/markitdown/converters/_rss_converter.py b/packages/markitdown/src/markitdown/converters/_rss_converter.py index eb2f09c..b279c85 100644 --- a/packages/markitdown/src/markitdown/converters/_rss_converter.py +++ b/packages/markitdown/src/markitdown/converters/_rss_converter.py @@ -9,6 +9,11 @@ class RssConverter(DocumentConverter): """Convert RSS / Atom type to markdown""" + def __init__( + self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT + ): + super().__init__(priority=priority) + def convert( self, local_path: str, **kwargs ) -> Union[None, DocumentConverterResult]: diff --git a/packages/markitdown/src/markitdown/converters/_wav_converter.py b/packages/markitdown/src/markitdown/converters/_wav_converter.py index 6fc8932..3c8d842 100644 --- a/packages/markitdown/src/markitdown/converters/_wav_converter.py +++ b/packages/markitdown/src/markitdown/converters/_wav_converter.py @@ -1,5 +1,5 @@ from typing import Union -from ._base import DocumentConverterResult +from ._base import DocumentConverter, DocumentConverterResult from ._media_converter import MediaConverter # Optional Transcription support @@ -17,6 +17,11 @@ class WavConverter(MediaConverter): Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed). """ + def __init__( + self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT + ): + super().__init__(priority=priority) + def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: # Bail if not a WAV extension = kwargs.get("file_extension", "") diff --git a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py index 4097ef0..f27fe23 100644 --- a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py +++ b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py @@ -10,6 +10,11 @@ class WikipediaConverter(DocumentConverter): """Handle Wikipedia pages separately, focusing only on the main document content.""" + def __init__( + self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT + ): + super().__init__(priority=priority) + def convert( self, local_path: str, **kwargs: Any ) -> Union[None, DocumentConverterResult]: diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py index 683349c..2bdfd5d 100644 --- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py @@ -2,7 +2,7 @@ import pandas as pd -from ._base import DocumentConverterResult +from ._base import DocumentConverter, DocumentConverterResult from ._html_converter import HtmlConverter @@ -11,6 +11,11 @@ class XlsxConverter(HtmlConverter): Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table. """ + def __init__( + self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT + ): + super().__init__(priority=priority) + def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: # Bail if not a XLSX extension = kwargs.get("file_extension", "") diff --git a/packages/markitdown/src/markitdown/converters/_youtube_converter.py b/packages/markitdown/src/markitdown/converters/_youtube_converter.py index fe198e8..b961b88 100644 --- a/packages/markitdown/src/markitdown/converters/_youtube_converter.py +++ b/packages/markitdown/src/markitdown/converters/_youtube_converter.py @@ -19,6 +19,11 @@ class YouTubeConverter(DocumentConverter): """Handle YouTube specially, focusing on the video title, description, and transcript.""" + def __init__( + self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT + ): + super().__init__(priority=priority) + def convert( self, local_path: str, **kwargs: Any ) -> Union[None, DocumentConverterResult]: diff --git a/packages/markitdown/src/markitdown/converters/_zip_converter.py b/packages/markitdown/src/markitdown/converters/_zip_converter.py index 918c357..026900d 100644 --- a/packages/markitdown/src/markitdown/converters/_zip_converter.py +++ b/packages/markitdown/src/markitdown/converters/_zip_converter.py @@ -45,6 +45,11 @@ class ZipConverter(DocumentConverter): - Cleans up temporary files after processing """ + def __init__( + self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT + ): + super().__init__(priority=priority) + def convert( self, local_path: str, **kwargs: Any ) -> Union[None, DocumentConverterResult]: diff --git a/packages/markitdown/tests/test_markitdown.py b/packages/markitdown/tests/test_markitdown.py index be71722..efd45ac 100644 --- a/packages/markitdown/tests/test_markitdown.py +++ b/packages/markitdown/tests/test_markitdown.py @@ -327,8 +327,8 @@ def test_markitdown_llm() -> None: if __name__ == "__main__": """Runs this file's tests from the command line.""" - # test_markitdown_remote() - # test_markitdown_local() + test_markitdown_remote() + test_markitdown_local() test_markitdown_exiftool() - # test_markitdown_deprecation() # test_markitdown_llm() + print("All tests passed!") From d1868f858819b30e6fcac38ec3c27dabe52fd998 Mon Sep 17 00:00:00 2001 From: Adam Fourney Date: Tue, 11 Feb 2025 10:35:48 -0800 Subject: [PATCH 2/4] Re-enabled try-catch. --- packages/markitdown/src/markitdown/_markitdown.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index a438903..36c8afb 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -364,11 +364,10 @@ def _convert( _kwargs["_parent_converters"] = self._page_converters # If we hit an error log it and keep trying - # try: - if True: + try: res = converter.convert(local_path, **_kwargs) - # except Exception: - # error_trace = ("\n\n" + traceback.format_exc()).strip() + except Exception: + error_trace = ("\n\n" + traceback.format_exc()).strip() if res is not None: # Normalize the content From 540410e5c86a287f8d0a94a1494ece063ddce5c0 Mon Sep 17 00:00:00 2001 From: Adam Fourney Date: Tue, 11 Feb 2025 12:31:17 -0800 Subject: [PATCH 3/4] Promote discussion of converter priority to a docstring. --- .../markitdown/src/markitdown/_markitdown.py | 18 --------------- .../src/markitdown/converters/_base.py | 22 +++++++++++++++++++ 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index 36c8afb..8669ad5 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -99,23 +99,6 @@ def __init__( # Register the converters self._page_converters: List[DocumentConverter] = [] - # Note: We have tight control over the order of built-in converters, but - # plugins can register converters in any order. A converter's .priority - # reasserts some control over the order of converters. - # - # Priorities work as follows. By default, most converters get priority - # DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception - # is the PlainTextConverter, which gets priority PRIORITY_SPECIFIC_FILE_FORMAT (== 10), - # with lower values being tried first (i.e., higher priority). - # - # Just prior to conversion, the converters are sorted by priority, using - # a stable sort. This means that converters with the same priority will - # remain in the same order, with the most recently registered converters - # appearing first. - # - # Plugins can register converters with any priority, to appear before or - # after the built-ins. For example, a plugin with priority 9 will run - # before the PlainTextConverter, but after the built-in converters. if ( enable_builtins is None or enable_builtins ): # Default to True when not specified @@ -128,7 +111,6 @@ def enable_builtins(self, **kwargs) -> None: """ Enable and register built-in converters. Built-in converters are enabled by default. - This method should only be called once, if built-ins were initially disabled. """ if not self._builtins_enabled: # TODO: Move these into converter constructors diff --git a/packages/markitdown/src/markitdown/converters/_base.py b/packages/markitdown/src/markitdown/converters/_base.py index 0a768dc..6df37f6 100644 --- a/packages/markitdown/src/markitdown/converters/_base.py +++ b/packages/markitdown/src/markitdown/converters/_base.py @@ -21,6 +21,28 @@ class DocumentConverter: ) def __init__(self, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT): + """ + Initialize the DocumentConverter with a given priority. + + Priorities work as follows: By default, most converters get priority + DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception + is the PlainTextConverter, which gets priority PRIORITY_SPECIFIC_FILE_FORMAT (== 10), + with lower values being tried first (i.e., higher priority). + + Just prior to conversion, the converters are sorted by priority, using + a stable sort. This means that converters with the same priority will + remain in the same order, with the most recently registered converters + appearing first. + + We have tight control over the order of built-in converters, but + plugins can register converters in any order. A converter's priority + field reasserts some control over the order of converters. + This method should only be called once, if built-ins were initially disabled. + + Plugins can register converters with any priority, to appear before or + after the built-ins. For example, a plugin with priority 9 will run + before the PlainTextConverter, but after the built-in converters. + """ self._priority = priority def convert( From 049b8f77f1f3e7e5f6372dd276fe6988611f28e1 Mon Sep 17 00:00:00 2001 From: Adam Fourney Date: Tue, 11 Feb 2025 12:34:46 -0800 Subject: [PATCH 4/4] Fix docstring. --- packages/markitdown/src/markitdown/_markitdown.py | 1 + packages/markitdown/src/markitdown/converters/_base.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index 8669ad5..297f554 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -111,6 +111,7 @@ def enable_builtins(self, **kwargs) -> None: """ Enable and register built-in converters. Built-in converters are enabled by default. + This method should only be called once, if built-ins were initially disabled. """ if not self._builtins_enabled: # TODO: Move these into converter constructors diff --git a/packages/markitdown/src/markitdown/converters/_base.py b/packages/markitdown/src/markitdown/converters/_base.py index 6df37f6..3947797 100644 --- a/packages/markitdown/src/markitdown/converters/_base.py +++ b/packages/markitdown/src/markitdown/converters/_base.py @@ -37,7 +37,6 @@ def __init__(self, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT): We have tight control over the order of built-in converters, but plugins can register converters in any order. A converter's priority field reasserts some control over the order of converters. - This method should only be called once, if built-ins were initially disabled. Plugins can register converters with any priority, to appear before or after the built-ins. For example, a plugin with priority 9 will run