Refactor TOC sanitation

- All postprocessors are run on heading content (not just `RawHtmlPostprocessor`).
- Footnote references are stripped from heading content. Fixes Python-Markdown#660.
- A more robust `striptags` is provided to convert headings to plain text. Unlike markupsafe's implementation, HTML entities are not unescaped.
- Both the plain text `name` and rich `html` are saved to `toc_tokens`, which means users can now access the full rich text content of the headings directly from the `toc_tokens`.
- `data-toc-label` is sanitized separately from heading content.
- An `html.unescape` call is added to `slugify` and `slugify_unicode`, which ensures `slugify` operates on Unicode characters rather than HTML entities. Because the call lives inside the functions themselves, users can override this behavior by supplying their own slugify functions if they desire.

Note that this first commit includes minimal changes to the tests to show very little change in behavior (mostly, the new `html` attribute was added to the `toc_tokens`). A refactoring of the tests will be in a separate commit.
waylan · Feb 9, 2024 · a8bb59e · a8bb59e
1 parent 421f1e8
commit a8bb59e
Show file tree

Hide file tree

Showing 4 changed files with 190 additions and 60 deletions.
diff --git a/markdown/extensions/toc.py b/markdown/extensions/toc.py
@@ -21,11 +21,14 @@
 
 from . import Extension
 from ..treeprocessors import Treeprocessor
-from ..util import code_escape, parseBoolValue, AMP_SUBSTITUTE, HTML_PLACEHOLDER_RE, AtomicString
+from ..util import parseBoolValue, AMP_SUBSTITUTE
 from ..treeprocessors import UnescapeTreeprocessor
+from ..serializers import RE_AMP
 import re
 import html
 import unicodedata
+from copy import deepcopy
+from html import unescape as html_unescape
 import xml.etree.ElementTree as etree
 from typing import TYPE_CHECKING, Any, Iterator, MutableSet
 
@@ -35,6 +38,8 @@
 
 def slugify(value: str, separator: str, unicode: bool = False) -> str:
     """ Slugify a string, to make it URL friendly. """
+    # First convert HTML entities to Unicode characters
+    value = html_unescape(value)
     if not unicode:
         # Replace Extended Latin characters with ASCII, i.e. `žlutý` => `zluty`
         value = unicodedata.normalize('NFKD', value)
@@ -63,41 +68,81 @@ def unique(id: str, ids: MutableSet[str]) -> str:
     return id
 
 
-def get_name(el: etree.Element) -> str:
-    """Get title name."""
-
-    text = []
-    for c in el.itertext():
-        if isinstance(c, AtomicString):
-            text.append(html.unescape(c))
-        else:
-            text.append(c)
-    return ''.join(text).strip()
-
-
-def stashedHTML2text(text: str, md: Markdown, strip_entities: bool = True) -> str:
-    """ Extract raw HTML from stash, reduce to plain text and swap with placeholder. """
-    def _html_sub(m: re.Match[str]) -> str:
-        """ Substitute raw html with plain text. """
-        try:
-            raw = md.htmlStash.rawHtmlBlocks[int(m.group(1))]
-        except (IndexError, TypeError):  # pragma: no cover
-            return m.group(0)
-        # Strip out tags and/or entities - leaving text
-        res = re.sub(r'(<[^>]+>)', '', raw)
-        if strip_entities:
-            res = re.sub(r'(&[\#a-zA-Z0-9]+;)', '', res)
-        return res
-
-    return HTML_PLACEHOLDER_RE.sub(_html_sub, text)
-
-
-def unescape(text: str) -> str:
-    """ Unescape escaped text. """
+def md_unescape(text: str) -> str:
+    """ Unescape Markdown backslash escaped text. """
     c = UnescapeTreeprocessor()
     return c.unescape(text)
 
 
+def strip_tags(text: str) -> str:
+    """ Strip HTML tags and return plain text. Note: HTML entities are unaffected. """
+    # A comment could contain a tag, so strip comments first
+    while (start := text.find('<!--')) != -1 and (end := text.find('-->', start)) != -1:
+        text = f'{text[:start]}{text[end + 3:]}'
+
+    while (start := text.find('<')) != -1 and (end := text.find('>', start)) != -1:
+        text = f'{text[:start]}{text[end + 1:]}'
+
+    # Collapse whitespace
+    text = ' '.join(text.split())
+    return text
+
+
+def escape_cdata(text: str) -> str:
+    """ Escape character data. """
+    if "&" in text:
+        # Only replace & when not part of an entity
+        text = RE_AMP.sub('&amp;', text)
+    if "<" in text:
+        text = text.replace("<", "&lt;")
+    if ">" in text:
+        text = text.replace(">", "&gt;")
+    return text
+
+
+def run_postprocessors(text: str, md: Markdown) -> str:
+    """ Run postprocessors from Markdown instance on text. """
+    for pp in md.postprocessors:
+        text = pp.run(text)
+    return text.strip()
+
+
+def render_inner_html(el: etree.Element, md: Markdown) -> str:
+    """ Fully render inner html of an etree element as a string. """
+    # The UnescapeTreeprocessor runs after TOC so run here.
+    text = md_unescape(md.serializer(el))
+
+    # strip parent tag
+    start = text.index('>') + 1
+    end = text.rindex('<')
+    text = text[start:end].strip()
+
+    return run_postprocessors(text, md)
+
+
+def copy_element(el: etree.Element, exclude_fnrefs=True) -> etree.Element:
+    """ Return a deep copy of an etree element, optionally with footnote references removed. """
+    el = deepcopy(el)
+    # Remove footnote references, which look like this: `<sup id="fnref:1">...</sup>`.
+    if exclude_fnrefs:
+        for sup in el.findall('sup'):
+            id = sup.get('id', '')
+            if id.startswith('fnref'):
+                # We have a footnote reference. Remove it.
+                parent = el.find(f'.//sup[@id="{id}"]..')
+                if sup.tail:
+                    # Preserve the tail text
+                    siblings = list(parent)
+                    pos = siblings.index(sup)
+                    if pos == 0:
+                        parent.text = f'{parent.text or ""}{sup.tail}'
+                    else:
+                        sibling = siblings[pos - 1]
+                        sibling.tail = f'{sibling.tail or ""}{sup.tail}'
+                parent.remove(sup)
+    return el
+
+
 def nest_toc_tokens(toc_list):
     """Given an unsorted list with errors and skips, return a nested one.
 
@@ -300,27 +345,29 @@ def run(self, doc: etree.Element) -> None:
         for el in doc.iter():
             if isinstance(el.tag, str) and self.header_rgx.match(el.tag):
                 self.set_level(el)
-                text = get_name(el)
+                html = render_inner_html(copy_element(el), self.md)
+                text = strip_tags(html)
 
                 # Do not override pre-existing ids
                 if "id" not in el.attrib:
-                    innertext = unescape(stashedHTML2text(text, self.md))
-                    el.attrib["id"] = unique(self.slugify(innertext, self.sep), used_ids)
+                    el.attrib["id"] = unique(self.slugify(text, self.sep), used_ids)
+
+                if 'data-toc-label' in el.attrib:
+                    text = md_unescape(el.attrib['data-toc-label'])
+                    text = run_postprocessors(text, self.md)
+                    text = strip_tags(text)
+                    text = escape_cdata(text)
+                    # Remove the data-toc-label attribute as it is no longer needed
+                    del el.attrib['data-toc-label']
 
                 if int(el.tag[-1]) >= self.toc_top and int(el.tag[-1]) <= self.toc_bottom:
                     toc_tokens.append({
                         'level': int(el.tag[-1]),
                         'id': el.attrib["id"],
-                        'name': unescape(stashedHTML2text(
-                            code_escape(el.attrib.get('data-toc-label', text)),
-                            self.md, strip_entities=False
-                        ))
+                        'name': text,
+                        'html': html
                     })
 
-                # Remove the data-toc-label attribute as it is no longer needed
-                if 'data-toc-label' in el.attrib:
-                    del el.attrib['data-toc-label']
-
                 if self.use_anchors:
                     self.add_anchor(el, el.attrib["id"])
                 if self.use_permalinks not in [False, None]:

diff --git a/tests/test_extensions.py b/tests/test_extensions.py
@@ -420,9 +420,9 @@ def testUniqueIds(self):
             '</div>\n'
         )
         self.assertEqual(self.md.toc_tokens, [
-            {'level': 1, 'id': 'header', 'name': 'Header', 'children': []},
-            {'level': 1, 'id': 'header_1', 'name': 'Header', 'children': []},
-            {'level': 1, 'id': 'header_2', 'name': 'Header', 'children': []},
+            {'level': 1, 'id': 'header', 'name': 'Header', 'html': 'Header', 'children': []},
+            {'level': 1, 'id': 'header_1', 'name': 'Header', 'html': 'Header', 'children': []},
+            {'level': 1, 'id': 'header_2', 'name': 'Header', 'html': 'Header', 'children': []},
         ])
 
     def testHtmlEntities(self):
@@ -441,7 +441,7 @@ def testHtmlEntities(self):
             '</div>\n'
         )
         self.assertEqual(self.md.toc_tokens, [
-            {'level': 1, 'id': 'foo-bar', 'name': 'Foo &amp; bar', 'children': []},
+            {'level': 1, 'id': 'foo-bar', 'name': 'Foo &amp; bar', 'html': 'Foo &amp; bar', 'children': []},
         ])
 
     def testHtmlSpecialChars(self):
@@ -460,7 +460,7 @@ def testHtmlSpecialChars(self):
             '</div>\n'
         )
         self.assertEqual(self.md.toc_tokens, [
-            {'level': 1, 'id': 'foo-bar', 'name': 'Foo &gt; &amp; bar', 'children': []},
+            {'level': 1, 'id': 'foo-bar', 'name': 'Foo &gt; &amp; bar', 'html': 'Foo &gt; &amp; bar', 'children': []},
         ])
 
     def testRawHtml(self):
@@ -479,7 +479,7 @@ def testRawHtml(self):
             '</div>\n'
         )
         self.assertEqual(self.md.toc_tokens, [
-            {'level': 1, 'id': 'foo-bar-baz', 'name': 'Foo Bar Baz.', 'children': []},
+            {'level': 1, 'id': 'foo-bar-baz', 'name': 'Foo Bar Baz.', 'html': 'Foo <b>Bar</b> Baz.', 'children': []},
         ])
 
     def testBaseLevel(self):
@@ -508,9 +508,9 @@ def testBaseLevel(self):
             '</div>\n'
         )
         self.assertEqual(md.toc_tokens, [
-            {'level': 5, 'id': 'some-header', 'name': 'Some Header', 'children': [
-                {'level': 6, 'id': 'next-level', 'name': 'Next Level', 'children': []},
-                {'level': 6, 'id': 'too-high', 'name': 'Too High', 'children': []},
+            {'level': 5, 'id': 'some-header', 'name': 'Some Header', 'html': 'Some Header', 'children': [
+                {'level': 6, 'id': 'next-level', 'name': 'Next Level', 'html': 'Next Level', 'children': []},
+                {'level': 6, 'id': 'too-high', 'name': 'Too High', 'html': 'Too High', 'children': []},
             ]},
         ])
 
@@ -532,9 +532,13 @@ def testHeaderInlineMarkup(self):
               '</ul>\n'                                    # noqa
             '</div>\n'
         )
-        self.assertEqual(self.md.toc_tokens, [
-            {'level': 1, 'id': 'some-header-with-markup', 'name': 'Some Header with markup.', 'children': []},
-        ])
+        self.assertEqual(self.md.toc_tokens, [{
+            'level': 1,
+            'id': 'some-header-with-markup',
+            'name': 'Some Header with markup.',
+            'html': 'Some <em>Header</em> with <a href="http://example.com">markup</a>.',
+            'children': []
+        }])
 
     def testTitle(self):
         """ Test TOC Title. """
@@ -549,6 +553,7 @@ def testTitle(self):
 
     def testWithAttrList(self):
         """ Test TOC with `attr_list` Extension. """
+        self.maxDiff = None
         md = markdown.Markdown(extensions=['toc', 'attr_list'])
         text = ('# Header 1\n\n'
                 '## Header 2 { #foo }\n\n'
@@ -580,12 +585,12 @@ def testWithAttrList(self):
             '</div>\n'
         )
         self.assertEqual(md.toc_tokens, [
-            {'level': 1, 'id': 'header-1', 'name': 'Header 1', 'children': [
-                {'level': 2, 'id': 'foo', 'name': 'Header 2', 'children': []},
-                {'level': 2, 'id': 'header-3', 'name': 'Foo Bar', 'children': []}
+            {'level': 1, 'id': 'header-1', 'name': 'Header 1', 'html': 'Header 1', 'children': [
+                {'level': 2, 'id': 'foo', 'name': 'Header 2', 'html': 'Header 2', 'children': []},
+                {'level': 2, 'id': 'header-3', 'name': 'Foo Bar', 'html': 'Header 3', 'children': []}
             ]},
-            {'level': 1, 'id': 'header-4', 'name': 'Foo &gt; Baz', 'children': []},
-            {'level': 1, 'id': 'header-5', 'name': 'Foo Quux', 'children': []},
+            {'level': 1, 'id': 'header-4', 'name': 'Foo &gt; Baz', 'html': 'Header 4', 'children': []},
+            {'level': 1, 'id': 'header-5', 'name': 'Foo Quux', 'html': 'Header 5', 'children': []},
         ])
 
     def testUniqueFunc(self):

diff --git a/tests/test_syntax/extensions/test_smarty.py b/tests/test_syntax/extensions/test_smarty.py
@@ -216,6 +216,7 @@ def test_smarty_and_toc(self):
                         'level': 1,
                         'id': 'foo-bar',
                         'name': 'Foo &mdash; bar',
+                        'html': '<em>Foo</em> &mdash; <code>bar</code>',
                         'children': [],
                     },
                 ],