diff --git a/markdown/extensions/toc.py b/markdown/extensions/toc.py index a17d7241..e7cde03e 100644 --- a/markdown/extensions/toc.py +++ b/markdown/extensions/toc.py @@ -21,11 +21,14 @@ from . import Extension from ..treeprocessors import Treeprocessor -from ..util import code_escape, parseBoolValue, AMP_SUBSTITUTE, HTML_PLACEHOLDER_RE, AtomicString +from ..util import parseBoolValue, AMP_SUBSTITUTE from ..treeprocessors import UnescapeTreeprocessor +from ..serializers import RE_AMP import re import html import unicodedata +from copy import deepcopy +from html import unescape as html_unescape import xml.etree.ElementTree as etree from typing import TYPE_CHECKING, Any, Iterator, MutableSet @@ -35,6 +38,8 @@ def slugify(value: str, separator: str, unicode: bool = False) -> str: """ Slugify a string, to make it URL friendly. """ + # First convert HTML entities to Unicode characters + value = html_unescape(value) if not unicode: # Replace Extended Latin characters with ASCII, i.e. `žlutý` => `zluty` value = unicodedata.normalize('NFKD', value) @@ -63,41 +68,81 @@ def unique(id: str, ids: MutableSet[str]) -> str: return id -def get_name(el: etree.Element) -> str: - """Get title name.""" - - text = [] - for c in el.itertext(): - if isinstance(c, AtomicString): - text.append(html.unescape(c)) - else: - text.append(c) - return ''.join(text).strip() - - -def stashedHTML2text(text: str, md: Markdown, strip_entities: bool = True) -> str: - """ Extract raw HTML from stash, reduce to plain text and swap with placeholder. """ - def _html_sub(m: re.Match[str]) -> str: - """ Substitute raw html with plain text. 
""" - try: - raw = md.htmlStash.rawHtmlBlocks[int(m.group(1))] - except (IndexError, TypeError): # pragma: no cover - return m.group(0) - # Strip out tags and/or entities - leaving text - res = re.sub(r'(<[^>]+>)', '', raw) - if strip_entities: - res = re.sub(r'(&[\#a-zA-Z0-9]+;)', '', res) - return res - - return HTML_PLACEHOLDER_RE.sub(_html_sub, text) - - -def unescape(text: str) -> str: - """ Unescape escaped text. """ +def md_unescape(text: str) -> str: + """ Unescape Markdown backslash escaped text. """ c = UnescapeTreeprocessor() return c.unescape(text) +def strip_tags(text: str) -> str: + """ Strip HTML tags and return plain text. Note: HTML entities are unaffected. """ + # A comment could contain a tag, so strip comments first + while (start := text.find('<!--')) != -1 and (end := text.find('-->', start)) != -1: + text = f'{text[:start]}{text[end + 3:]}' + + while (start := text.find('<')) != -1 and (end := text.find('>', start)) != -1: + text = f'{text[:start]}{text[end + 1:]}' + + # Collapse whitespace + text = ' '.join(text.split()) + return text + + +def escape_cdata(text: str) -> str: + """ Escape character data. """ + if "&" in text: + # Only replace & when not part of an entity + text = RE_AMP.sub('&amp;', text) + if "<" in text: + text = text.replace("<", "&lt;") + if ">" in text: + text = text.replace(">", "&gt;") + return text + + +def run_postprocessors(text: str, md: Markdown) -> str: + """ Run postprocessors from Markdown instance on text. """ + for pp in md.postprocessors: + text = pp.run(text) + return text.strip() + + +def render_inner_html(el: etree.Element, md: Markdown) -> str: + """ Fully render inner html of an etree element as a string. """ + # The UnescapeTreeprocessor runs after TOC so run here. 
+ text = md_unescape(md.serializer(el)) + + # strip parent tag + start = text.index('>') + 1 + end = text.rindex('<') + text = text[start:end].strip() + + return run_postprocessors(text, md) + + +def copy_element(el: etree.Element, exclude_fnrefs=True) -> etree.Element: + """ Return a deep copy of an etree element, optionally with footnote references removed. """ + el = deepcopy(el) + # Remove footnote references, which look like this: `<sup id="fnref:1">...</sup>`. + if exclude_fnrefs: + for sup in el.findall('sup'): + id = sup.get('id', '') + if id.startswith('fnref'): + # We have a footnote reference. Remove it. + parent = el.find(f'.//sup[@id="{id}"]..') + if sup.tail: + # Preserve the tail text + siblings = list(parent) + pos = siblings.index(sup) + if pos == 0: + parent.text = f'{parent.text or ""}{sup.tail}' + else: + sibling = siblings[pos - 1] + sibling.tail = f'{sibling.tail or ""}{sup.tail}' + parent.remove(sup) + return el + + +def nest_toc_tokens(toc_list): """Given an unsorted list with errors and skips, return a nested one. 
@@ -300,27 +345,29 @@ def run(self, doc: etree.Element) -> None: for el in doc.iter(): if isinstance(el.tag, str) and self.header_rgx.match(el.tag): self.set_level(el) - text = get_name(el) + html = render_inner_html(copy_element(el), self.md) + text = strip_tags(html) # Do not override pre-existing ids if "id" not in el.attrib: - innertext = unescape(stashedHTML2text(text, self.md)) - el.attrib["id"] = unique(self.slugify(innertext, self.sep), used_ids) + el.attrib["id"] = unique(self.slugify(text, self.sep), used_ids) + + if 'data-toc-label' in el.attrib: + text = md_unescape(el.attrib['data-toc-label']) + text = run_postprocessors(text, self.md) + text = strip_tags(text) + text = escape_cdata(text) + # Remove the data-toc-label attribute as it is no longer needed + del el.attrib['data-toc-label'] if int(el.tag[-1]) >= self.toc_top and int(el.tag[-1]) <= self.toc_bottom: toc_tokens.append({ 'level': int(el.tag[-1]), 'id': el.attrib["id"], - 'name': unescape(stashedHTML2text( - code_escape(el.attrib.get('data-toc-label', text)), - self.md, strip_entities=False - )) + 'name': text, + 'html': html }) - # Remove the data-toc-label attribute as it is no longer needed - if 'data-toc-label' in el.attrib: - del el.attrib['data-toc-label'] - if self.use_anchors: self.add_anchor(el, el.attrib["id"]) if self.use_permalinks not in [False, None]: diff --git a/tests/test_extensions.py b/tests/test_extensions.py index a9e789f1..4ebe4eca 100644 --- a/tests/test_extensions.py +++ b/tests/test_extensions.py @@ -420,9 +420,9 @@ def testUniqueIds(self): '\n' ) self.assertEqual(self.md.toc_tokens, [ - {'level': 1, 'id': 'header', 'name': 'Header', 'children': []}, - {'level': 1, 'id': 'header_1', 'name': 'Header', 'children': []}, - {'level': 1, 'id': 'header_2', 'name': 'Header', 'children': []}, + {'level': 1, 'id': 'header', 'name': 'Header', 'html': 'Header', 'children': []}, + {'level': 1, 'id': 'header_1', 'name': 'Header', 'html': 'Header', 'children': []}, + {'level': 1, 
'id': 'header_2', 'name': 'Header', 'html': 'Header', 'children': []}, ]) def testHtmlEntities(self): @@ -441,7 +441,7 @@ def testHtmlEntities(self): '\n' ) self.assertEqual(self.md.toc_tokens, [ - {'level': 1, 'id': 'foo-bar', 'name': 'Foo & bar', 'children': []}, + {'level': 1, 'id': 'foo-bar', 'name': 'Foo & bar', 'html': 'Foo & bar', 'children': []}, ]) def testHtmlSpecialChars(self): @@ -460,7 +460,7 @@ def testHtmlSpecialChars(self): '\n' ) self.assertEqual(self.md.toc_tokens, [ - {'level': 1, 'id': 'foo-bar', 'name': 'Foo > & bar', 'children': []}, + {'level': 1, 'id': 'foo-bar', 'name': 'Foo > & bar', 'html': 'Foo > & bar', 'children': []}, ]) def testRawHtml(self): @@ -479,7 +479,7 @@ def testRawHtml(self): '\n' ) self.assertEqual(self.md.toc_tokens, [ - {'level': 1, 'id': 'foo-bar-baz', 'name': 'Foo Bar Baz.', 'children': []}, + {'level': 1, 'id': 'foo-bar-baz', 'name': 'Foo Bar Baz.', 'html': 'Foo Bar Baz.', 'children': []}, ]) def testBaseLevel(self): @@ -508,9 +508,9 @@ def testBaseLevel(self): '\n' ) self.assertEqual(md.toc_tokens, [ - {'level': 5, 'id': 'some-header', 'name': 'Some Header', 'children': [ - {'level': 6, 'id': 'next-level', 'name': 'Next Level', 'children': []}, - {'level': 6, 'id': 'too-high', 'name': 'Too High', 'children': []}, + {'level': 5, 'id': 'some-header', 'name': 'Some Header', 'html': 'Some Header', 'children': [ + {'level': 6, 'id': 'next-level', 'name': 'Next Level', 'html': 'Next Level', 'children': []}, + {'level': 6, 'id': 'too-high', 'name': 'Too High', 'html': 'Too High', 'children': []}, ]}, ]) @@ -532,9 +532,13 @@ def testHeaderInlineMarkup(self): '\n' # noqa '\n' ) - self.assertEqual(self.md.toc_tokens, [ - {'level': 1, 'id': 'some-header-with-markup', 'name': 'Some Header with markup.', 'children': []}, - ]) + self.assertEqual(self.md.toc_tokens, [{ + 'level': 1, + 'id': 'some-header-with-markup', + 'name': 'Some Header with markup.', + 'html': 'Some Header with markup.', + 'children': [] + }]) def 
testTitle(self): """ Test TOC Title. """ @@ -549,6 +553,7 @@ def testTitle(self): def testWithAttrList(self): """ Test TOC with `attr_list` Extension. """ + self.maxDiff = None md = markdown.Markdown(extensions=['toc', 'attr_list']) text = ('# Header 1\n\n' '## Header 2 { #foo }\n\n' @@ -580,12 +585,12 @@ def testWithAttrList(self): '\n' ) self.assertEqual(md.toc_tokens, [ - {'level': 1, 'id': 'header-1', 'name': 'Header 1', 'children': [ - {'level': 2, 'id': 'foo', 'name': 'Header 2', 'children': []}, - {'level': 2, 'id': 'header-3', 'name': 'Foo Bar', 'children': []} + {'level': 1, 'id': 'header-1', 'name': 'Header 1', 'html': 'Header 1', 'children': [ + {'level': 2, 'id': 'foo', 'name': 'Header 2', 'html': 'Header 2', 'children': []}, + {'level': 2, 'id': 'header-3', 'name': 'Foo Bar', 'html': 'Header 3', 'children': []} ]}, - {'level': 1, 'id': 'header-4', 'name': 'Foo > Baz', 'children': []}, - {'level': 1, 'id': 'header-5', 'name': 'Foo Quux', 'children': []}, + {'level': 1, 'id': 'header-4', 'name': 'Foo > Baz', 'html': 'Header 4', 'children': []}, + {'level': 1, 'id': 'header-5', 'name': 'Foo Quux', 'html': 'Header 5', 'children': []}, ]) def testUniqueFunc(self): diff --git a/tests/test_syntax/extensions/test_smarty.py b/tests/test_syntax/extensions/test_smarty.py index 8a176745..d2d9b094 100644 --- a/tests/test_syntax/extensions/test_smarty.py +++ b/tests/test_syntax/extensions/test_smarty.py @@ -216,6 +216,7 @@ def test_smarty_and_toc(self): 'level': 1, 'id': 'foo-bar', 'name': 'Foo — bar', + 'html': 'Foobar', 'children': [], }, ], diff --git a/tests/test_syntax/extensions/test_toc.py b/tests/test_syntax/extensions/test_toc.py index 79764364..ff5a1774 100644 --- a/tests/test_syntax/extensions/test_toc.py +++ b/tests/test_syntax/extensions/test_toc.py @@ -140,11 +140,13 @@ def testMinMaxLevel(self): 'level': 3, 'id': 'header-3', 'name': 'Header 3', + 'html': 'Header 3', 'children': [ { 'level': 4, 'id': 'header-4', 'name': 'Header 4', + 'html': 'Header 
4', 'children': [] } ] @@ -189,11 +191,13 @@ def testMaxLevel(self): 'level': 1, 'id': 'header-1', 'name': 'Header 1', + 'html': 'Header 1', 'children': [ { 'level': 2, 'id': 'header-2', 'name': 'Header 2', + 'html': 'Header 2', 'children': [] } ] @@ -245,11 +249,13 @@ def testMinMaxLevelwithAnchorLink(self): 'level': 3, 'id': 'header-3', 'name': 'Header 3', + 'html': 'Header 3', 'children': [ { 'level': 4, 'id': 'header-4', 'name': 'Header 4', + 'html': 'Header 4', 'children': [] } ] @@ -301,11 +307,13 @@ def testMinMaxLevelwithPermalink(self): 'level': 3, 'id': 'header-3', 'name': 'Header 3', + 'html': 'Header 3', 'children': [ { 'level': 4, 'id': 'header-4', 'name': 'Header 4', + 'html': 'Header 4', 'children': [] } ] @@ -353,11 +361,13 @@ def testMinMaxLevelwithBaseLevel(self): 'level': 4, 'id': 'second-level', 'name': 'Second Level', + 'html': 'Second Level', 'children': [ { 'level': 5, 'id': 'third-level', 'name': 'Third Level', + 'html': 'Third Level', 'children': [] } ] @@ -402,11 +412,13 @@ def testMaxLevelwithBaseLevel(self): 'level': 2, 'id': 'some-header', 'name': 'Some Header', + 'html': 'Some Header', 'children': [ { 'level': 3, 'id': 'next-level', 'name': 'Next Level', + 'html': 'Next Level', 'children': [] } ] @@ -455,6 +467,7 @@ def test_escaped_char_in_id(self): 'level': 1, 'id': 'escaped_character', 'name': 'escaped_character', + 'html': 'escaped_character', 'children': [] } ] @@ -671,3 +684,67 @@ def testTOCWithCustomTitleClass(self): ), extensions=[TocExtension(title_class="tocname", title='ToC')] ) + + def testHeadingRemoveFootnoteRef(self): + + self.assertMarkdownRenders( + self.dedent( + ''' + # Header 1[^1] + # Header[^1] 2 + # Header *subelement*[^1] 3 + + [^1]: footnote + ''' + ), + self.dedent( + ''' +

Header 11

+

Header1 2

+

Header subelement1 3

+
+
+
    +
  1. +

    footnote 

    +
  2. +
+
+ ''' + ), + expected_attrs={ + 'toc': ( + '
\n' + '\n' # noqa + '
\n' # noqa + ), + 'toc_tokens': [ + { + 'level': 1, + 'id': 'header-1', + 'name': 'Header 1', + 'html': 'Header 1', + 'children': [] + }, + { + 'level': 1, + 'id': 'header-2', + 'name': 'Header 2', + 'html': 'Header 2', + 'children': [] + }, + { + 'level': 1, + 'id': 'header-subelement-3', + 'name': 'Header subelement 3', + 'html': 'Header subelement 3', + 'children': [] + } + ] + }, + extensions=[TocExtension(), 'footnotes'] + )