Replace linkify-it-py dependency with a GFM autolink plugin for markdown-it
hukkin · Dec 11, 2024 · 8813b42 · 8813b42
1 parent ec9e66f
commit 8813b42
Show file tree

Hide file tree

Showing 5 changed files with 235 additions and 23 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -15,7 +15,7 @@ license = { file = "LICENSE" }
 requires-python = ">=3.9"
 dependencies = [
     'mdformat >=0.7.5,<0.8.0',
-    'markdown-it-py[linkify]',  # Let `mdformat` choose version boundaries for `markdown-it-py`
+    'markdown-it-py',  # Let `mdformat` choose version boundaries for `markdown-it-py`
     'mdit-py-plugins >=0.2.0',
     'mdformat-tables >=0.4.0',
 ]
@@ -29,7 +29,7 @@ keywords = ["mdformat", "markdown", "formatter", "gfm"]
 "Homepage" = "https://github.com/hukkin/mdformat-gfm"
 
 [project.entry-points."mdformat.parser_extension"]
-"gfm" = "mdformat_gfm.plugin"
+"gfm" = "mdformat_gfm._mdformat_plugin"
 
 
 [tool.tox]

diff --git a/src/mdformat_gfm/plugin.py → src/mdformat_gfm/_mdformat_plugin.py b/src/mdformat_gfm/plugin.py → src/mdformat_gfm/_mdformat_plugin.py
@@ -5,14 +5,12 @@
 from mdformat.renderer import DEFAULT_RENDERERS, RenderContext, RenderTreeNode
 from mdit_py_plugins.tasklists import tasklists_plugin
 
-# A regex that matches a URL scheme and a following colon, as is valid in CommonMark
-RE_COMMONMARK_URL_SCHEME = re.compile("[A-Za-z][A-Za-z0-9+.-]{1,31}:")
+from mdformat_gfm._mdit_gfm_autolink_plugin import gfm_autolink_plugin
 
 
 def update_mdit(mdit: MarkdownIt) -> None:
-    # Enable linkify-it-py (for GFM autolink extension)
-    mdit.options["linkify"] = True
-    mdit.enable("linkify")
+    # Enable GFM autolink extension
+    mdit.use(gfm_autolink_plugin)
 
     # Enable mdformat-tables plugin
     tables_plugin = mdformat.plugins.PARSER_EXTENSIONS["tables"]
@@ -111,20 +109,8 @@ def _postprocess_inline(text: str, node: RenderTreeNode, context: RenderContext)
     return text
 
 
-def _link_renderer(node: RenderTreeNode, context: RenderContext) -> str:
-    """Extend the default link renderer to handle linkify links."""
-    if node.markup == "linkify":
-        autolink_url = node.attrs["href"]
-        assert isinstance(autolink_url, str)
-        startswith_scheme = RE_COMMONMARK_URL_SCHEME.match(autolink_url)
-        if startswith_scheme and not node.children[0].content.startswith(
-            startswith_scheme.group()
-        ):
-            autolink_url = autolink_url.split(":", maxsplit=1)[1]
-            if autolink_url.startswith("//"):
-                autolink_url = autolink_url[2:]
-        return autolink_url
-    return _render_with_default_renderer(node, context)
+def _gfm_autolink_renderer(node: RenderTreeNode, context: RenderContext) -> str:
+    """Render a GFM autolink node as its stored source text.
+
+    The text is read from the ``source_autolink`` entry of the node's
+    ``meta`` mapping. ``context`` is not used.
+    """
+    source_text = node.meta["source_autolink"]
+    return source_text
 
 
 def _escape_text(text: str, node: RenderTreeNode, context: RenderContext) -> str:
@@ -147,7 +133,7 @@ def _escape_paragraph(text: str, node: RenderTreeNode, context: RenderContext) -
 RENDERERS = {
     "s": _strikethrough_renderer,
     "list_item": _list_item_renderer,
-    "link": _link_renderer,
+    "gfm_autolink": _gfm_autolink_renderer,
 }
 POSTPROCESSORS = {
     "text": _escape_text,

diff --git a/src/mdformat_gfm/_mdit_gfm_autolink_plugin.py b/src/mdformat_gfm/_mdit_gfm_autolink_plugin.py
@@ -0,0 +1,216 @@
+import re
+
+from markdown_it import MarkdownIt
+from markdown_it.rules_inline import StateInline
+
+
+def gfm_autolink_plugin(md: MarkdownIt) -> None:
+    """Register the GFM autolink inline rule on a MarkdownIt parser.
+
+    The ``gfm_autolink`` rule is inserted before the ``linkify`` rule and
+    the ``text`` inline rule is disabled.
+    """
+    inline_ruler = md.inline.ruler
+    inline_ruler.before("linkify", "gfm_autolink", gfm_autolink)
+    # The "text" inline rule would skip over "www."-prefixed links, so it
+    # has to be disabled. This is probably disastrous for performance. An
+    # alternative would likely be to override the "text" inline rule with
+    # one that stops at a "." that is preceded by "www".
+    inline_ruler.disable("text")
+
+
+# Matches a candidate email address: local part, "@", then two or more
+# dot-separated domain segments.
+# A string that matches this must still be invalidated if it ends with "_" or "-".
+RE_GFM_EMAIL = re.compile(r"[a-zA-Z0-9._+-]+@[a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)+")
+# Matches a candidate domain: two or more dot-separated segments.
+# A string that matches this must still be invalidated if the last two
+# segments contain "_".
+RE_GFM_AUTOLINK_DOMAIN = re.compile(r"[a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)+")
+
+# Matches an entity-reference-like sequence ("&", alphanumerics, ";") that
+# sits at the very end of the searched range.
+RE_ENDS_IN_ENTITY_REF = re.compile(r"&[a-zA-Z0-9]+;\Z")
+
+# Whitespace characters, as specified in
+# https://github.github.com/gfm/#whitespace-character
+# (spec version 0.29-gfm, 2019-04-06)
+GFM_WHITESPACE = frozenset(" \t\n\v\f\r")
+
+# ASCII letters (both cases) and digits.
+ASCII_ALPHANUMERICS = frozenset(
+    "abcdefghijklmnopqrstuvwxyz" "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "0123456789"
+)
+
+
+def gfm_autolink(state: StateInline, silent: bool) -> bool:  # noqa: C901
+    """Markdown-it-py inline rule that parses GFM extended autolinks.
+
+    Autolinks are recognized as specified here:
+    https://github.github.com/gfm/#autolinks-extension-
+
+    Four forms are tried at the current position, in this order:
+    "www."-prefixed domains, "http://" / "https://" URLs, "mailto:" /
+    "xmpp:" addresses, and bare email addresses.
+
+    Args:
+        state: Parse state object.
+        silent: Disables token generation.
+    Returns:
+        bool: True if GFM autolink found.
+    """
+    # Never produce a link inside the text of another link: nested anchors
+    # are invalid. markdown-it's own "linkify" inline rule applies the same
+    # guard. `getattr` keeps the rule usable with a state object that does
+    # not carry the attribute.
+    if getattr(state, "linkLevel", 0) > 0:
+        return False
+
+    pos = state.pos
+    src = state.src
+
+    # Autolink can only be at the beginning of a line, after whitespace,
+    # or any of the delimiting characters *, _, ~, and (.
+    # Two plain membership tests are used instead of a set union so that no
+    # new set is allocated on each invocation of the rule.
+    if pos:
+        preceding_char = src[pos - 1]
+        if (
+            preceding_char not in GFM_WHITESPACE
+            and preceding_char not in {"*", "_", "~", "("}
+        ):
+            return False
+
+    if src.startswith("www.", pos):
+        pos += 4
+        try:
+            pos, domain, resource = read_domain_and_resource(src, pos)
+        except NotFound:
+            return False
+
+        url = f"www.{domain}{resource}"
+        # The link target gets an explicit scheme; the source text does not.
+        full_url = "http://" + url
+    elif src.startswith(("http://", "https://"), pos):
+        # Both prefixes share "http"; the fifth character tells them apart.
+        scheme = "https://" if src[pos + 4] == "s" else "http://"
+        pos += len(scheme)
+
+        try:
+            pos, domain, resource = read_domain_and_resource(src, pos)
+        except NotFound:
+            return False
+
+        url = f"{scheme}{domain}{resource}"
+        full_url = url
+    elif src.startswith(("mailto:", "xmpp:"), pos):
+        scheme = "xmpp:" if src[pos] == "x" else "mailto:"
+        pos += len(scheme)
+
+        try:
+            pos, email = read_email(src, pos)
+        except NotFound:
+            return False
+
+        # Only "xmpp:" addresses may carry a "/resource" suffix.
+        if scheme == "xmpp:" and src[pos : pos + 1] == "/":
+            pos += 1
+            resource_start_pos = pos
+            while pos < len(src) and (
+                src[pos] in ASCII_ALPHANUMERICS or src[pos] in {".", "@"}
+            ):
+                pos += 1
+            resource = src[resource_start_pos:pos]
+            # A single trailing period is not part of the resource.
+            if resource.endswith("."):
+                pos -= 1
+                resource = resource[:-1]
+            # A "/" with nothing usable after it invalidates the autolink.
+            if not resource:
+                return False
+        else:
+            resource = ""
+
+        source_autolink = scheme + email
+        if resource:
+            source_autolink += "/" + resource
+
+        url = source_autolink
+        full_url = source_autolink
+    else:
+        try:
+            pos, email = read_email(src, pos)
+        except NotFound:
+            return False
+
+        url = email
+        # A bare email address links to a "mailto:" target.
+        full_url = "mailto:" + email
+
+    normalized_full_url = state.md.normalizeLink(full_url)
+    if not state.md.validateLink(normalized_full_url):
+        return False
+
+    # `push_tokens` emits nothing in silent mode; the position is advanced
+    # past the autolink either way.
+    push_tokens(state, normalized_full_url, url, silent)
+    state.pos = pos
+    return True
+
+
+def push_tokens(
+    state: StateInline, full_url: str, source_url: str, silent: bool
+) -> None:
+    """Push the open, text and close tokens of one autolink onto ``state``.
+
+    Nothing is pushed when ``silent`` is true. The opening token carries
+    ``full_url`` as its ``href`` attribute and ``source_url`` in its meta
+    under ``source_autolink``. The text token holds the link-text
+    normalized form of ``source_url``.
+    """
+    if not silent:
+        opening = state.push("gfm_autolink_open", "a", 1)
+        opening.attrs = {"href": full_url}
+        opening.meta = {"source_autolink": source_url}
+
+        link_text = state.push("text", "", 0)
+        link_text.content = state.md.normalizeLinkText(source_url)
+
+        state.push("gfm_autolink_close", "a", -1)
+
+
+def trim_resource(untrimmed: str) -> tuple[str, int]:
+    """Trim illegal trailing chars from autolink resource.
+
+    Trim trailing punctuation, parentheses and entity refs as per GFM
+    spec. Also trim backslashes. The spec does not mention backslash,
+    but I think it should. This is referred to as "extended autolink
+    path validation" in the GFM spec.
+
+    A trailing ")" is only trimmed while the remaining text holds more
+    closing than opening parentheses.
+
+    Args:
+        untrimmed: The raw resource text following the domain.
+    Returns:
+        A tuple with the trimmed resource and the amount of characters
+        removed.
+    """
+    # Parenthesis counts for the part of the string that is still kept
+    # (`untrimmed[: i + 1]`). They are counted once and maintained while
+    # scanning, instead of being recounted for every trailing ")".
+    # Only ")" can ever be dropped from the kept part: a "(" stops the scan,
+    # and neither the other trimmed characters nor entity references
+    # contain parentheses. So only the closing count needs updating.
+    open_count = untrimmed.count("(")
+    close_count = untrimmed.count(")")
+
+    i = len(untrimmed) - 1
+    while i >= 0:
+        c = untrimmed[i]
+        if c == ";":
+            # A ";" is only trimmed as part of a whole trailing entity
+            # reference; jump to the start of that reference.
+            ending_entity_match = RE_ENDS_IN_ENTITY_REF.search(untrimmed, endpos=i + 1)
+            if not ending_entity_match:
+                break
+            i = ending_entity_match.start()
+        elif c == ")":
+            # Balanced (or surplus opening) parentheses: keep this ")".
+            if open_count >= close_count:
+                break
+            close_count -= 1
+        elif c in {"?", "!", ".", ",", ":", "*", "_", "~"}:
+            pass
+        elif c == "\\":  # not part of the spec, but should be
+            pass
+        else:
+            break
+        i -= 1
+
+    trimmed = untrimmed[: i + 1]
+    trim_count = len(untrimmed) - len(trimmed)
+    return trimmed, trim_count
+
+
+class NotFound(Exception):
+    """Raised if a function didn't find what it was looking for.
+
+    The reader helpers in this module raise it when no valid domain or
+    email address starts at the given position.
+    """
+
+
+def read_domain_and_resource(src: str, pos: int) -> tuple[int, str, str]:
+    """Read autolink domain and resource.
+
+    The domain must start at ``pos``. The resource is everything after
+    the domain up to the next whitespace character or "<", with illegal
+    trailing characters trimmed off.
+
+    Args:
+        src: The text being parsed.
+        pos: Index in ``src`` where the domain is expected to start.
+    Returns:
+        A tuple (pos, domain, resource), where pos is the index right
+        after the trimmed resource.
+    Raises:
+        NotFound: If no domain starts at ``pos``, or if either of its last
+            two segments contains an underscore.
+    """
+    domain_match = RE_GFM_AUTOLINK_DOMAIN.match(src, pos)
+    if not domain_match:
+        raise NotFound
+    domain = domain_match.group()
+    pos = domain_match.end()
+    # The pattern guarantees at least one ".", so there are always at least
+    # two segments to inspect.
+    segments = domain.rsplit(".", 2)
+    if "_" in segments[-2] or "_" in segments[-1]:
+        raise NotFound
+
+    resource_start_pos = pos
+    # The length and the stop condition are loop invariants: avoid building
+    # a new set union and calling `len` for every scanned character.
+    src_len = len(src)
+    while pos < src_len and src[pos] not in GFM_WHITESPACE and src[pos] != "<":
+        pos += 1
+    resource = src[resource_start_pos:pos]
+
+    # Step the position back over whatever was trimmed from the end.
+    resource, trim_count = trim_resource(resource)
+    pos -= trim_count
+    return pos, domain, resource
+
+
+def read_email(src: str, pos: int) -> tuple[int, str]:
+    """Read an autolink email address that starts at ``pos``.
+
+    Return a tuple (pos, email), where pos is the index right after the
+    address. Raise NotFound if no valid address starts at ``pos``.
+    """
+    match = RE_GFM_EMAIL.match(src, pos)
+    if match is None:
+        raise NotFound
+
+    email = match.group()
+    # A trailing hyphen or underscore invalidates the address.
+    if email.endswith(("-", "_")):
+        raise NotFound
+    end = match.end()
+
+    # This isn't really part of the GFM spec, but an attempt to cover
+    # up its flaws. If a trailing hyphen or underscore invalidates an
+    # autolink, then an escaped hyphen or underscore should too.
+    following = src[end : end + 2]
+    if following == "\\-" or following == "\\_":
+        raise NotFound
+
+    return end, email
diff --git a/tests/data/default_style.md b/tests/data/default_style.md
@@ -62,7 +62,7 @@ Autolink with a backslash
 .
 http://www.python.org/autolink\extension
 .
-http://www.python.org/autolink%5Cextension
+http://www.python.org/autolink\extension
 .
 
 Autolink with percentage encoded space

diff --git a/tests/test_markdown_it_plugin.py b/tests/test_markdown_it_plugin.py
@@ -0,0 +1,10 @@
+from markdown_it import MarkdownIt
+
+from mdformat_gfm._mdit_gfm_autolink_plugin import gfm_autolink_plugin
+
+def test_gfm_autolink():
+    """A bare "www." link renders as an anchor with an "http://" href."""
+    source = "GFM autolink www.commonmark.org"
+    expected = '<p>GFM autolink <a href="http://www.commonmark.org">www.commonmark.org</a></p>\n'
+    parser = MarkdownIt()
+    parser.use(gfm_autolink_plugin)
+    assert parser.render(source) == expected