Skip to content

Commit

Permalink
Replace linkify-it-py dependency with a GFM autolink plugin for markd…
Browse files Browse the repository at this point in the history
…own-it
  • Loading branch information
hukkin committed Dec 11, 2024
1 parent ec9e66f commit 8813b42
Show file tree
Hide file tree
Showing 5 changed files with 235 additions and 23 deletions.
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ license = { file = "LICENSE" }
requires-python = ">=3.9"
dependencies = [
'mdformat >=0.7.5,<0.8.0',
'markdown-it-py[linkify]', # Let `mdformat` choose version boundaries for `markdown-it-py`
'markdown-it-py', # Let `mdformat` choose version boundaries for `markdown-it-py`
'mdit-py-plugins >=0.2.0',
'mdformat-tables >=0.4.0',
]
Expand All @@ -29,7 +29,7 @@ keywords = ["mdformat", "markdown", "formatter", "gfm"]
"Homepage" = "https://github.com/hukkin/mdformat-gfm"

[project.entry-points."mdformat.parser_extension"]
"gfm" = "mdformat_gfm.plugin"
"gfm" = "mdformat_gfm._mdformat_plugin"


[tool.tox]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,12 @@
from mdformat.renderer import DEFAULT_RENDERERS, RenderContext, RenderTreeNode
from mdit_py_plugins.tasklists import tasklists_plugin

# A regex that matches a URL scheme and a following colon, as is valid in CommonMark
RE_COMMONMARK_URL_SCHEME = re.compile("[A-Za-z][A-Za-z0-9+.-]{1,31}:")
from mdformat_gfm._mdit_gfm_autolink_plugin import gfm_autolink_plugin


def update_mdit(mdit: MarkdownIt) -> None:
# Enable linkify-it-py (for GFM autolink extension)
mdit.options["linkify"] = True
mdit.enable("linkify")
# Enable GFM autolink extension
mdit.use(gfm_autolink_plugin)

# Enable mdformat-tables plugin
tables_plugin = mdformat.plugins.PARSER_EXTENSIONS["tables"]
Expand Down Expand Up @@ -111,20 +109,8 @@ def _postprocess_inline(text: str, node: RenderTreeNode, context: RenderContext)
return text


def _link_renderer(node: RenderTreeNode, context: RenderContext) -> str:
    """Render a link node, writing linkify autolinks back as bare URLs.

    Nodes whose markup is not "linkify" are delegated to the default
    renderer. For linkify nodes the "href" attribute is returned, minus
    its URL scheme (and a following "//") whenever the link text does not
    itself start with that scheme.
    """
    if node.markup != "linkify":
        return _render_with_default_renderer(node, context)

    href = node.attrs["href"]
    assert isinstance(href, str)

    scheme_match = RE_COMMONMARK_URL_SCHEME.match(href)
    if scheme_match is None:
        return href

    link_text = node.children[0].content
    if link_text.startswith(scheme_match.group()):
        # The scheme was written out in the source, so keep it
        return href

    # The scheme is absent from the link text: strip it from the href too.
    # A ":" is guaranteed to exist here because the scheme regex matched.
    _, _, schemeless = href.partition(":")
    if schemeless.startswith("//"):
        schemeless = schemeless[2:]
    return schemeless
def _gfm_autolink_renderer(node: RenderTreeNode, context: RenderContext) -> str:
    """Render a GFM autolink node as the text stored in its metadata.

    Returns the "source_autolink" entry of ``node.meta`` unchanged.
    ``context`` is accepted but not used.
    """
    source_autolink = node.meta["source_autolink"]
    return source_autolink


def _escape_text(text: str, node: RenderTreeNode, context: RenderContext) -> str:
Expand All @@ -147,7 +133,7 @@ def _escape_paragraph(text: str, node: RenderTreeNode, context: RenderContext) -
# Renderer functions provided by this module, looked up by string key.
# NOTE(review): the keys are presumably mdformat syntax tree node types
# (part of the mdformat plugin interface) — confirm against mdformat docs.
RENDERERS = {
    "s": _strikethrough_renderer,
    "list_item": _list_item_renderer,
    "link": _link_renderer,
    "gfm_autolink": _gfm_autolink_renderer,
}
POSTPROCESSORS = {
"text": _escape_text,
Expand Down
216 changes: 216 additions & 0 deletions src/mdformat_gfm/_mdit_gfm_autolink_plugin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,216 @@
import re

from markdown_it import MarkdownIt
from markdown_it.rules_inline import StateInline


def gfm_autolink_plugin(md: MarkdownIt) -> None:
    """Install GFM autolink parsing on a markdown-it parser.

    Registers the ``gfm_autolink`` inline rule ahead of the "linkify"
    rule and switches the "text" inline rule off.
    """
    inline_ruler = md.inline.ruler
    inline_ruler.before("linkify", "gfm_autolink", gfm_autolink)
    # The "text" inline rule would skip over "www." prefixed links, so it
    # has to be disabled. This is probably disastrous for performance. An
    # alternative would be to override the "text" inline rule with one that
    # stops at a "." prefixed by "www".
    inline_ruler.disable("text")


# Candidate email autolink. A match alone is not sufficient: the matched
# string must still be invalidated if it ends with "_" or "-".
RE_GFM_EMAIL = re.compile(r"[a-zA-Z0-9._+-]+@[a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)+")
# Candidate autolink domain (at least two "."-separated segments). A match
# alone is not sufficient: the matched string must still be invalidated if
# either of its last two segments contains "_".
RE_GFM_AUTOLINK_DOMAIN = re.compile(r"[a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)+")

# "&", one or more ASCII alphanumerics and ";" at the very end of the
# searched region, i.e. trailing text that looks like an entity reference.
RE_ENDS_IN_ENTITY_REF = re.compile(r"&[a-zA-Z0-9]+;\Z")

# Whitespace characters, as specified in
# https://github.github.com/gfm/#whitespace-character
# (spec version 0.29-gfm, 2019-04-06)
GFM_WHITESPACE = frozenset(" \t\n\v\f\r")

# The 62 ASCII letters and digits
ASCII_ALPHANUMERICS = frozenset(
    "abcdefghijklmnopqrstuvwxyz" "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "0123456789"
)


def _read_xmpp_resource(src: str, pos: int) -> tuple[int, str]:
    """Read the resource that follows the "/" of an "xmpp:" autolink.

    Consume ASCII alphanumerics, "." and "@" starting at ``pos``. One
    trailing "." is left out of the resource. Return a tuple
    (pos, resource); ``resource`` is empty if nothing usable was read.
    """
    # Build the allowed character set once, not once per scanned character
    resource_chars = ASCII_ALPHANUMERICS | {".", "@"}
    start_pos = pos
    src_len = len(src)
    while pos < src_len and src[pos] in resource_chars:
        pos += 1
    resource = src[start_pos:pos]
    if resource.endswith("."):
        pos -= 1
        resource = resource[:-1]
    return pos, resource


def gfm_autolink(state: StateInline, silent: bool) -> bool:  # noqa: C901
    """Markdown-it-py rule to parse GFM autolinks.

    This parser autolinks as specified here:
    https://github.github.com/gfm/#autolinks-extension-

    Four forms are tried in order at ``state.pos``: "www." links,
    "http://" / "https://" links, "mailto:" / "xmpp:" links, and bare
    email addresses. On success the tokens are pushed (unless ``silent``)
    and ``state.pos`` is advanced past the autolink.

    Args:
        state: Parse state object.
        silent: Disables token generation.

    Returns:
        bool: True if GFM autolink found.
    """
    pos = state.pos
    src = state.src

    # Autolink can only be at the beginning of a line, after whitespace,
    # or any of the delimiting characters *, _, ~, and (.
    # This check runs on every invocation of the rule, so it deliberately
    # avoids allocating a new set each time. `preceding_char` is a single
    # character, which makes `in "*_~("` a plain membership test.
    if pos:
        preceding_char = src[pos - 1]
        if preceding_char not in GFM_WHITESPACE and preceding_char not in "*_~(":
            return False

    if src.startswith("www.", pos):
        pos += 4
        try:
            pos, domain, resource = read_domain_and_resource(src, pos)
        except NotFound:
            return False

        url = f"www.{domain}{resource}"
        # The source text has no scheme; the link target gets "http://"
        full_url = "http://" + url
    elif src.startswith(("http://", "https://"), pos):
        # Index 4 of "https://" is the "s" that tells the two schemes apart
        scheme = "https://" if src[pos + 4] == "s" else "http://"
        pos += len(scheme)

        try:
            pos, domain, resource = read_domain_and_resource(src, pos)
        except NotFound:
            return False

        url = f"{scheme}{domain}{resource}"
        full_url = url
    elif src.startswith(("mailto:", "xmpp:"), pos):
        scheme = "xmpp:" if src[pos] == "x" else "mailto:"
        pos += len(scheme)

        try:
            pos, email = read_email(src, pos)
        except NotFound:
            return False

        resource = ""
        # Only "xmpp:" links may continue with a "/resource" part
        if scheme == "xmpp:" and src[pos : pos + 1] == "/":
            pos, resource = _read_xmpp_resource(src, pos + 1)
            if not resource:
                # A "/" that is not followed by a resource rejects the link
                return False

        source_autolink = scheme + email
        if resource:
            source_autolink += "/" + resource

        url = source_autolink
        full_url = source_autolink
    else:
        try:
            pos, email = read_email(src, pos)
        except NotFound:
            return False

        url = email
        # A bare email address links to a "mailto:" target
        full_url = "mailto:" + email

    normalized_full_url = state.md.normalizeLink(full_url)
    if not state.md.validateLink(normalized_full_url):
        return False

    push_tokens(state, normalized_full_url, url, silent)
    state.pos = pos
    return True


def push_tokens(
    state: StateInline, full_url: str, source_url: str, silent: bool
) -> None:
    """Push the open/text/close token triple for one GFM autolink.

    Nothing is pushed when ``silent`` is true. Otherwise ``full_url``
    becomes the "href" attribute of the opening token, while
    ``source_url`` is stored in that token's ``meta`` under
    "source_autolink" and, passed through ``normalizeLinkText``, used as
    the content of the text token.
    """
    if not silent:
        opening = state.push("gfm_autolink_open", "a", 1)
        opening.attrs = {"href": full_url}
        opening.meta = {"source_autolink": source_url}

        link_text = state.push("text", "", 0)
        link_text.content = state.md.normalizeLinkText(source_url)

        state.push("gfm_autolink_close", "a", -1)


def trim_resource(untrimmed: str) -> tuple[str, int]:
    """Trim illegal trailing chars from autolink resource.

    Trim trailing punctuation, parentheses and entity refs as per GFM
    spec. Also trim backslashes. The spec does not mention backslash,
    but I think it should. This is referred to as "extended autolink
    path validation" in the GFM spec.

    Args:
        untrimmed: The resource as read from the source, possibly empty.

    Returns:
        A tuple with the trimmed resource and the amount of characters
        removed.
    """
    # Parenthesis counts of the part that has not been trimmed yet. The
    # loop never trims a "(" (it stops there), and entity references and
    # punctuation contain no parentheses, so only a trimmed ")" changes
    # these counts. Tracking them avoids recounting the whole prefix for
    # every trailing ")".
    open_count = untrimmed.count("(")
    close_count = untrimmed.count(")")

    i = len(untrimmed) - 1
    while i >= 0:
        c = untrimmed[i]
        if c == ";":
            ending_entity_match = RE_ENDS_IN_ENTITY_REF.search(untrimmed, endpos=i + 1)
            if not ending_entity_match:
                break
            # Jump to the "&"; the decrement below then steps past it
            i = ending_entity_match.start()
        elif c == ")":
            # A closing parenthesis is kept as long as it is balanced
            if open_count >= close_count:
                break
            close_count -= 1
        elif c in {"?", "!", ".", ",", ":", "*", "_", "~"}:
            pass
        elif c == "\\":  # not part of the spec, but should be
            pass
        else:
            break
        i -= 1

    trimmed = untrimmed[: i + 1]
    trim_count = len(untrimmed) - len(trimmed)
    return trimmed, trim_count


class NotFound(Exception):
    """Signals that a function could not find what it was looking for."""


def read_domain_and_resource(src: str, pos: int) -> tuple[int, str, str]:
    """Read autolink domain and resource.

    The domain must match ``RE_GFM_AUTOLINK_DOMAIN`` at ``pos`` and have
    no "_" in its last two segments. The resource is everything after the
    domain up to the next whitespace character or "<", with illegal
    trailing characters trimmed off by ``trim_resource``.

    Args:
        src: The text being parsed.
        pos: Index in ``src`` where the domain is expected to start.

    Returns:
        A tuple (pos, domain, resource), where ``pos`` is the index right
        after the trimmed resource.

    Raises:
        NotFound: If no valid domain starts at ``pos``.
    """
    domain_match = RE_GFM_AUTOLINK_DOMAIN.match(src, pos)
    if not domain_match:
        raise NotFound
    domain = domain_match.group()
    pos = domain_match.end()
    # The regex guarantees at least one ".", hence at least two segments
    segments = domain.rsplit(".", 2)
    if "_" in segments[-2] or "_" in segments[-1]:
        raise NotFound

    # Build the terminator set once, not once per scanned character
    terminators = GFM_WHITESPACE | {"<"}
    resource_start_pos = pos
    src_len = len(src)
    while pos < src_len and src[pos] not in terminators:
        pos += 1
    resource = src[resource_start_pos:pos]

    resource, trim_count = trim_resource(resource)
    # Hand the trimmed characters back to the parser
    pos -= trim_count
    return pos, domain, resource


def read_email(src: str, pos: int) -> tuple[int, str]:
    """Read an autolink email address starting at ``pos``.

    Return a tuple (pos, email), where ``pos`` is the index right after
    the address. Raise NotFound if no valid address starts at ``pos``.
    """
    match = RE_GFM_EMAIL.match(src, pos)
    if match is None:
        raise NotFound

    email = match.group()
    # A regex match that ends in "-" or "_" is not a valid autolink
    if email[-1] in {"-", "_"}:
        raise NotFound

    end_pos = match.end()
    # This isn't really part of the GFM spec, but an attempt to cover
    # up its flaws. If a trailing hyphen or underscore invalidates an
    # autolink, then an escaped hyphen or underscore should too.
    following = src[end_pos : end_pos + 2]
    if following in {"\\-", "\\_"}:
        raise NotFound

    return end_pos, email
2 changes: 1 addition & 1 deletion tests/data/default_style.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ Autolink with a backslash
.
http://www.python.org/autolink\extension
.
http://www.python.org/autolink%5Cextension
http://www.python.org/autolink\extension
.

Autolink with percentage encoded space
Expand Down
10 changes: 10 additions & 0 deletions tests/test_markdown_it_plugin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from markdown_it import MarkdownIt

from mdformat_gfm._mdit_gfm_autolink_plugin import gfm_autolink_plugin

def test_gfm_autolink():
    """A "www." autolink renders as an anchor with an "http://" href."""
    parser = MarkdownIt()
    parser.use(gfm_autolink_plugin)

    rendered = parser.render("GFM autolink www.commonmark.org")

    expected = '<p>GFM autolink <a href="http://www.commonmark.org">www.commonmark.org</a></p>\n'
    assert rendered == expected

0 comments on commit 8813b42

Please sign in to comment.