Skip to content

Commit

Permalink
Refactor TOC sanitation
Browse files Browse the repository at this point in the history
- All postprocessors are run on heading content (not just
  `RawHtmlPostprocessor`).
- Footnote references are stripped from heading content. Fixes Python-Markdown#660.
- A more robust `striptags` is provided to convert headings to plain text.
  Unlike markupsafe's implementation, HTML entities are not unescaped.
- Both the plain text `name` and rich `html` are saved to `toc_tokens`,
  which means users can now access the full rich text content of the
  headings directly from the `toc_tokens`.
- `data-toc-label` is sanitized separately from heading content.
- A `html.unescape` call added to `slugify` and `slugify_unicode`, which
  ensures `slugify` operates on Unicode characters, rather than HTML
  entities. By including it in the functions, users can override with their
  own slugify functions if they desire.

Note that this first commit includes minimal changes to the tests to show
very little change in behavior (mostly the new `html` attribute of the
`toc_tokens` was added). A refactoring of the tests will be in a separate
commit.
  • Loading branch information
waylan committed Feb 9, 2024
1 parent 421f1e8 commit a8bb59e
Show file tree
Hide file tree
Showing 4 changed files with 190 additions and 60 deletions.
133 changes: 90 additions & 43 deletions markdown/extensions/toc.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,14 @@

from . import Extension
from ..treeprocessors import Treeprocessor
from ..util import code_escape, parseBoolValue, AMP_SUBSTITUTE, HTML_PLACEHOLDER_RE, AtomicString
from ..util import parseBoolValue, AMP_SUBSTITUTE
from ..treeprocessors import UnescapeTreeprocessor
from ..serializers import RE_AMP
import re
import html
import unicodedata
from copy import deepcopy
from html import unescape as html_unescape
import xml.etree.ElementTree as etree
from typing import TYPE_CHECKING, Any, Iterator, MutableSet

Expand All @@ -35,6 +38,8 @@

def slugify(value: str, separator: str, unicode: bool = False) -> str:
""" Slugify a string, to make it URL friendly. """
# First convert HTML entities to Unicode characters
value = html_unescape(value)
if not unicode:
# Replace Extended Latin characters with ASCII, i.e. `žlutý` => `zluty`
value = unicodedata.normalize('NFKD', value)
Expand Down Expand Up @@ -63,41 +68,81 @@ def unique(id: str, ids: MutableSet[str]) -> str:
return id


def get_name(el: etree.Element) -> str:
    """Return the text content of `el` with surrounding whitespace removed.

    Text chunks that are `AtomicString` instances have their HTML entities
    unescaped; every other chunk is used unchanged.
    """
    chunks = (
        html.unescape(chunk) if isinstance(chunk, AtomicString) else chunk
        for chunk in el.itertext()
    )
    return ''.join(chunks).strip()


def stashedHTML2text(text: str, md: Markdown, strip_entities: bool = True) -> str:
    """ Swap each raw HTML placeholder in `text` for the plain text of the stashed HTML.

    Tags are always removed from the stashed HTML. HTML entities are also
    removed unless `strip_entities` is false.
    """

    def _plain_text(match: re.Match[str]) -> str:
        """ Return the stashed block referenced by `match`, reduced to plain text. """
        try:
            stashed = md.htmlStash.rawHtmlBlocks[int(match.group(1))]
        except (IndexError, TypeError):  # pragma: no cover
            # Nothing usable in the stash for this placeholder; leave it as is.
            return match.group(0)
        # Drop the tags, keeping only the text between them.
        plain = re.sub(r'(<[^>]+>)', '', stashed)
        if not strip_entities:
            return plain
        return re.sub(r'(&[\#a-zA-Z0-9]+;)', '', plain)

    return HTML_PLACEHOLDER_RE.sub(_plain_text, text)


def unescape(text: str) -> str:
""" Unescape escaped text. """
def md_unescape(text: str) -> str:
    """ Return `text` with Markdown backslash escapes unescaped.

    Delegates to the `unescape` method of a fresh `UnescapeTreeprocessor`.
    """
    return UnescapeTreeprocessor().unescape(text)


def strip_tags(text: str) -> str:
    """ Remove HTML comments and tags from `text` and return the plain text.

    Runs of whitespace are collapsed to single spaces and the ends are
    trimmed. HTML entities are left exactly as they are.
    """
    # Comments are removed before tags because a comment may itself contain a tag.
    while True:
        opener = text.find('<!--')
        if opener == -1:
            break
        closer = text.find('-->', opener)
        if closer == -1:
            # Unterminated comment: leave the remainder alone.
            break
        text = text[:opener] + text[closer + 3:]

    # Now drop anything that looks like a tag, i.e. `<` up to the next `>`.
    while True:
        opener = text.find('<')
        if opener == -1:
            break
        closer = text.find('>', opener)
        if closer == -1:
            break
        text = text[:opener] + text[closer + 1:]

    # Collapse whitespace
    return ' '.join(text.split())


def escape_cdata(text: str) -> str:
    """ Return `text` with `&`, `<` and `>` escaped for use as character data.

    Ampersands are substituted through `RE_AMP`, so that an `&` which already
    starts an entity is not escaped a second time.
    """
    if "&" in text:
        # Only replace & when not part of an entity
        text = RE_AMP.sub('&amp;', text)
    for char, entity in (("<", "&lt;"), (">", "&gt;")):
        text = text.replace(char, entity)
    return text


def run_postprocessors(text: str, md: Markdown) -> str:
    """ Pass `text` through every postprocessor of `md`, in order.

    Each postprocessor receives the output of the previous one. The final
    result is returned with leading and trailing whitespace stripped.
    """
    result = text
    for postprocessor in md.postprocessors:
        result = postprocessor.run(result)
    return result.strip()


def render_inner_html(el: etree.Element, md: Markdown) -> str:
    """ Render the content of `el` (without its own tag) as a finished HTML string.

    The element is serialized with `md.serializer`, Markdown backslash
    escapes are unescaped, the element's own opening and closing tags are
    cut away and the remainder is run through the postprocessors of `md`.
    """
    # The UnescapeTreeprocessor runs after TOC so run here.
    serialized = md_unescape(md.serializer(el))

    # Keep only what lies between the first `>` (end of the opening tag)
    # and the last `<` (start of the closing tag).
    inner = serialized[serialized.index('>') + 1:serialized.rindex('<')]

    return run_postprocessors(inner.strip(), md)


def copy_element(el: etree.Element, exclude_fnrefs=True) -> etree.Element:
    """ Return a deep copy of an etree element, optionally with footnote references removed.

    Arguments:
        el: The element to copy. It is never modified.
        exclude_fnrefs: When true (the default), every `<sup>` element whose
            `id` starts with `fnref` is removed from the copy, at any depth.

    Returns:
        The copy. The tail text of each removed reference is kept by
        appending it to the preceding sibling's tail or, when there is no
        preceding sibling, to the parent's text.
    """
    el = deepcopy(el)
    if not exclude_fnrefs:
        return el

    # Footnote references look like this: `<sup id="fnref:1">...</sup>`.
    # Visit every element that has a `sup` child, so that references nested
    # inside inline markup (e.g. `<em>`) are found, not just direct children.
    for parent in el.findall('.//sup/..'):
        carry_text = ''
        # Walk a reversed snapshot of the children: removing a child is then
        # safe, and tail text can be handed back to the preceding sibling.
        for child in reversed(list(parent)):
            if child.tag == 'sup' and child.get('id', '').startswith('fnref'):
                # We have a footnote reference. Remove it but keep its tail.
                carry_text = f'{child.tail or ""}{carry_text}'
                parent.remove(child)
            elif carry_text:
                child.tail = f'{child.tail or ""}{carry_text}'
                carry_text = ''
        if carry_text:
            # No preceding sibling was left to take the text.
            parent.text = f'{parent.text or ""}{carry_text}'
    return el


def nest_toc_tokens(toc_list):
"""Given an unsorted list with errors and skips, return a nested one.
Expand Down Expand Up @@ -300,27 +345,29 @@ def run(self, doc: etree.Element) -> None:
for el in doc.iter():
if isinstance(el.tag, str) and self.header_rgx.match(el.tag):
self.set_level(el)
text = get_name(el)
html = render_inner_html(copy_element(el), self.md)
text = strip_tags(html)

# Do not override pre-existing ids
if "id" not in el.attrib:
innertext = unescape(stashedHTML2text(text, self.md))
el.attrib["id"] = unique(self.slugify(innertext, self.sep), used_ids)
el.attrib["id"] = unique(self.slugify(text, self.sep), used_ids)

if 'data-toc-label' in el.attrib:
text = md_unescape(el.attrib['data-toc-label'])
text = run_postprocessors(text, self.md)
text = strip_tags(text)
text = escape_cdata(text)
# Remove the data-toc-label attribute as it is no longer needed
del el.attrib['data-toc-label']

if int(el.tag[-1]) >= self.toc_top and int(el.tag[-1]) <= self.toc_bottom:
toc_tokens.append({
'level': int(el.tag[-1]),
'id': el.attrib["id"],
'name': unescape(stashedHTML2text(
code_escape(el.attrib.get('data-toc-label', text)),
self.md, strip_entities=False
))
'name': text,
'html': html
})

# Remove the data-toc-label attribute as it is no longer needed
if 'data-toc-label' in el.attrib:
del el.attrib['data-toc-label']

if self.use_anchors:
self.add_anchor(el, el.attrib["id"])
if self.use_permalinks not in [False, None]:
Expand Down
39 changes: 22 additions & 17 deletions tests/test_extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,9 +420,9 @@ def testUniqueIds(self):
'</div>\n'
)
self.assertEqual(self.md.toc_tokens, [
{'level': 1, 'id': 'header', 'name': 'Header', 'children': []},
{'level': 1, 'id': 'header_1', 'name': 'Header', 'children': []},
{'level': 1, 'id': 'header_2', 'name': 'Header', 'children': []},
{'level': 1, 'id': 'header', 'name': 'Header', 'html': 'Header', 'children': []},
{'level': 1, 'id': 'header_1', 'name': 'Header', 'html': 'Header', 'children': []},
{'level': 1, 'id': 'header_2', 'name': 'Header', 'html': 'Header', 'children': []},
])

def testHtmlEntities(self):
Expand All @@ -441,7 +441,7 @@ def testHtmlEntities(self):
'</div>\n'
)
self.assertEqual(self.md.toc_tokens, [
{'level': 1, 'id': 'foo-bar', 'name': 'Foo &amp; bar', 'children': []},
{'level': 1, 'id': 'foo-bar', 'name': 'Foo &amp; bar', 'html': 'Foo &amp; bar', 'children': []},
])

def testHtmlSpecialChars(self):
Expand All @@ -460,7 +460,7 @@ def testHtmlSpecialChars(self):
'</div>\n'
)
self.assertEqual(self.md.toc_tokens, [
{'level': 1, 'id': 'foo-bar', 'name': 'Foo &gt; &amp; bar', 'children': []},
{'level': 1, 'id': 'foo-bar', 'name': 'Foo &gt; &amp; bar', 'html': 'Foo &gt; &amp; bar', 'children': []},
])

def testRawHtml(self):
Expand All @@ -479,7 +479,7 @@ def testRawHtml(self):
'</div>\n'
)
self.assertEqual(self.md.toc_tokens, [
{'level': 1, 'id': 'foo-bar-baz', 'name': 'Foo Bar Baz.', 'children': []},
{'level': 1, 'id': 'foo-bar-baz', 'name': 'Foo Bar Baz.', 'html': 'Foo <b>Bar</b> Baz.', 'children': []},
])

def testBaseLevel(self):
Expand Down Expand Up @@ -508,9 +508,9 @@ def testBaseLevel(self):
'</div>\n'
)
self.assertEqual(md.toc_tokens, [
{'level': 5, 'id': 'some-header', 'name': 'Some Header', 'children': [
{'level': 6, 'id': 'next-level', 'name': 'Next Level', 'children': []},
{'level': 6, 'id': 'too-high', 'name': 'Too High', 'children': []},
{'level': 5, 'id': 'some-header', 'name': 'Some Header', 'html': 'Some Header', 'children': [
{'level': 6, 'id': 'next-level', 'name': 'Next Level', 'html': 'Next Level', 'children': []},
{'level': 6, 'id': 'too-high', 'name': 'Too High', 'html': 'Too High', 'children': []},
]},
])

Expand All @@ -532,9 +532,13 @@ def testHeaderInlineMarkup(self):
'</ul>\n' # noqa
'</div>\n'
)
self.assertEqual(self.md.toc_tokens, [
{'level': 1, 'id': 'some-header-with-markup', 'name': 'Some Header with markup.', 'children': []},
])
self.assertEqual(self.md.toc_tokens, [{
'level': 1,
'id': 'some-header-with-markup',
'name': 'Some Header with markup.',
'html': 'Some <em>Header</em> with <a href="http://example.com">markup</a>.',
'children': []
}])

def testTitle(self):
""" Test TOC Title. """
Expand All @@ -549,6 +553,7 @@ def testTitle(self):

def testWithAttrList(self):
""" Test TOC with `attr_list` Extension. """
self.maxDiff = None
md = markdown.Markdown(extensions=['toc', 'attr_list'])
text = ('# Header 1\n\n'
'## Header 2 { #foo }\n\n'
Expand Down Expand Up @@ -580,12 +585,12 @@ def testWithAttrList(self):
'</div>\n'
)
self.assertEqual(md.toc_tokens, [
{'level': 1, 'id': 'header-1', 'name': 'Header 1', 'children': [
{'level': 2, 'id': 'foo', 'name': 'Header 2', 'children': []},
{'level': 2, 'id': 'header-3', 'name': 'Foo Bar', 'children': []}
{'level': 1, 'id': 'header-1', 'name': 'Header 1', 'html': 'Header 1', 'children': [
{'level': 2, 'id': 'foo', 'name': 'Header 2', 'html': 'Header 2', 'children': []},
{'level': 2, 'id': 'header-3', 'name': 'Foo Bar', 'html': 'Header 3', 'children': []}
]},
{'level': 1, 'id': 'header-4', 'name': 'Foo &gt; Baz', 'children': []},
{'level': 1, 'id': 'header-5', 'name': 'Foo Quux', 'children': []},
{'level': 1, 'id': 'header-4', 'name': 'Foo &gt; Baz', 'html': 'Header 4', 'children': []},
{'level': 1, 'id': 'header-5', 'name': 'Foo Quux', 'html': 'Header 5', 'children': []},
])

def testUniqueFunc(self):
Expand Down
1 change: 1 addition & 0 deletions tests/test_syntax/extensions/test_smarty.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,7 @@ def test_smarty_and_toc(self):
'level': 1,
'id': 'foo-bar',
'name': 'Foo &mdash; bar',
'html': '<em>Foo</em> &mdash; <code>bar</code>',
'children': [],
},
],
Expand Down
Loading

0 comments on commit a8bb59e

Please sign in to comment.