diff --git a/markdown/extensions/toc.py b/markdown/extensions/toc.py
index a17d7241..e7cde03e 100644
--- a/markdown/extensions/toc.py
+++ b/markdown/extensions/toc.py
@@ -21,11 +21,14 @@
from . import Extension
from ..treeprocessors import Treeprocessor
-from ..util import code_escape, parseBoolValue, AMP_SUBSTITUTE, HTML_PLACEHOLDER_RE, AtomicString
+from ..util import parseBoolValue, AMP_SUBSTITUTE
from ..treeprocessors import UnescapeTreeprocessor
+from ..serializers import RE_AMP
import re
import html
import unicodedata
+from copy import deepcopy
+from html import unescape as html_unescape
import xml.etree.ElementTree as etree
from typing import TYPE_CHECKING, Any, Iterator, MutableSet
@@ -35,6 +38,8 @@
def slugify(value: str, separator: str, unicode: bool = False) -> str:
""" Slugify a string, to make it URL friendly. """
+ # First convert HTML entities to Unicode characters
+ value = html_unescape(value)
if not unicode:
# Replace Extended Latin characters with ASCII, i.e. `žlutý` => `zluty`
value = unicodedata.normalize('NFKD', value)
@@ -63,41 +68,81 @@ def unique(id: str, ids: MutableSet[str]) -> str:
return id
-def get_name(el: etree.Element) -> str:
- """Get title name."""
-
- text = []
- for c in el.itertext():
- if isinstance(c, AtomicString):
- text.append(html.unescape(c))
- else:
- text.append(c)
- return ''.join(text).strip()
-
-
-def stashedHTML2text(text: str, md: Markdown, strip_entities: bool = True) -> str:
- """ Extract raw HTML from stash, reduce to plain text and swap with placeholder. """
- def _html_sub(m: re.Match[str]) -> str:
- """ Substitute raw html with plain text. """
- try:
- raw = md.htmlStash.rawHtmlBlocks[int(m.group(1))]
- except (IndexError, TypeError): # pragma: no cover
- return m.group(0)
- # Strip out tags and/or entities - leaving text
- res = re.sub(r'(<[^>]+>)', '', raw)
- if strip_entities:
- res = re.sub(r'(&[\#a-zA-Z0-9]+;)', '', res)
- return res
-
- return HTML_PLACEHOLDER_RE.sub(_html_sub, text)
-
-
-def unescape(text: str) -> str:
- """ Unescape escaped text. """
+def md_unescape(text: str) -> str:
+ """ Unescape Markdown backslash-escaped text using a new `UnescapeTreeprocessor` instance. """
c = UnescapeTreeprocessor()
return c.unescape(text)
+def strip_tags(text: str) -> str:
+    """
+    Strip HTML tags and return plain text. Note: HTML entities are unaffected.
+
+    Comments are removed first, then every remaining `<...>` span, and finally all runs of
+    whitespace are collapsed to a single space (leading and trailing whitespace is dropped).
+    A comment or tag that is opened but never closed is left in the text as is.
+    """
+    # A comment could contain a tag, so strip comments first
+    while (start := text.find('<!--')) != -1 and (end := text.find('-->', start)) != -1:
+        # `end + 3` skips over the closing `-->`
+        text = f'{text[:start]}{text[end + 3:]}'
+
+    while (start := text.find('<')) != -1 and (end := text.find('>', start)) != -1:
+        text = f'{text[:start]}{text[end + 1:]}'
+
+    # Collapse whitespace
+    text = ' '.join(text.split())
+    return text
+
+
+def escape_cdata(text: str) -> str:
+    """
+    Escape character data.
+
+    Every `&` matched by `RE_AMP` is replaced with `&amp;`, every `<` with `&lt;` and
+    every `>` with `&gt;`. Text containing none of these characters is returned unchanged.
+    """
+    if "&" in text:
+        # Only replace & when not part of an entity
+        text = RE_AMP.sub('&amp;', text)
+    if "<" in text:
+        text = text.replace("<", "&lt;")
+    if ">" in text:
+        text = text.replace(">", "&gt;")
+    return text
+
+
+def run_postprocessors(text: str, md: Markdown) -> str:
+    """ Feed `text` through each postprocessor of `md` in order and return the stripped result. """
+    rendered = text
+    for postprocessor in md.postprocessors:
+        # Each postprocessor receives the output of the previous one.
+        rendered = postprocessor.run(rendered)
+    return rendered.strip()
+
+
+def render_inner_html(el: etree.Element, md: Markdown) -> str:
+    """ Serialize `el`, drop its own enclosing tags and return the postprocessed inner HTML. """
+    # The UnescapeTreeprocessor runs after TOC, so backslash escapes are resolved here instead.
+    serialized = md_unescape(md.serializer(el))
+
+    # The content sits between the first `>` (end of the opening tag)
+    # and the last `<` (start of the closing tag).
+    inner = serialized[serialized.index('>') + 1:serialized.rindex('<')].strip()
+
+    return run_postprocessors(inner, md)
+
+
+def copy_element(el: etree.Element, exclude_fnrefs: bool = True) -> etree.Element:
+    """
+    Return a deep copy of an etree element, optionally with footnote references removed.
+
+    The element passed in is never modified. When `exclude_fnrefs` is true, every `sup`
+    element whose `id` starts with `fnref` is removed from the copy, at any nesting depth,
+    and the `tail` text which followed it is kept in place.
+    """
+    el = deepcopy(el)
+    # Remove footnote references, which look like this: `<sup id="fnref:1">...</sup>`.
+    if exclude_fnrefs:
+        # Visit every element which has a `sup` child so that nested references are found too.
+        for parent in el.findall('.//sup/..'):
+            carry_text = ''
+            # Iterate in reverse so that children can be removed while iterating.
+            for child in reversed(parent):
+                if child.tag == 'sup' and child.get('id', '').startswith('fnref'):
+                    # We have a footnote reference. Remove it but hold on to its tail text.
+                    carry_text = f'{child.tail or ""}{carry_text}'
+                    parent.remove(child)
+                elif carry_text:
+                    # Preserve the tail text on the closest preceding sibling
+                    child.tail = f'{child.tail or ""}{carry_text}'
+                    carry_text = ''
+            if carry_text:
+                # No preceding sibling was left, so the text belongs to the parent itself
+                parent.text = f'{parent.text or ""}{carry_text}'
+    return el
+
+
def nest_toc_tokens(toc_list):
"""Given an unsorted list with errors and skips, return a nested one.
@@ -300,27 +345,29 @@ def run(self, doc: etree.Element) -> None:
for el in doc.iter():
if isinstance(el.tag, str) and self.header_rgx.match(el.tag):
self.set_level(el)
- text = get_name(el)
+ html = render_inner_html(copy_element(el), self.md)
+ text = strip_tags(html)
# Do not override pre-existing ids
if "id" not in el.attrib:
- innertext = unescape(stashedHTML2text(text, self.md))
- el.attrib["id"] = unique(self.slugify(innertext, self.sep), used_ids)
+ el.attrib["id"] = unique(self.slugify(text, self.sep), used_ids)
+
+ if 'data-toc-label' in el.attrib:
+ text = md_unescape(el.attrib['data-toc-label'])
+ text = run_postprocessors(text, self.md)
+ text = strip_tags(text)
+ text = escape_cdata(text)
+ # Remove the data-toc-label attribute as it is no longer needed
+ del el.attrib['data-toc-label']
if int(el.tag[-1]) >= self.toc_top and int(el.tag[-1]) <= self.toc_bottom:
toc_tokens.append({
'level': int(el.tag[-1]),
'id': el.attrib["id"],
- 'name': unescape(stashedHTML2text(
- code_escape(el.attrib.get('data-toc-label', text)),
- self.md, strip_entities=False
- ))
+ 'name': text,
+ 'html': html
})
- # Remove the data-toc-label attribute as it is no longer needed
- if 'data-toc-label' in el.attrib:
- del el.attrib['data-toc-label']
-
if self.use_anchors:
self.add_anchor(el, el.attrib["id"])
if self.use_permalinks not in [False, None]:
diff --git a/tests/test_extensions.py b/tests/test_extensions.py
index a9e789f1..4ebe4eca 100644
--- a/tests/test_extensions.py
+++ b/tests/test_extensions.py
@@ -420,9 +420,9 @@ def testUniqueIds(self):
'\n'
)
self.assertEqual(self.md.toc_tokens, [
- {'level': 1, 'id': 'header', 'name': 'Header', 'children': []},
- {'level': 1, 'id': 'header_1', 'name': 'Header', 'children': []},
- {'level': 1, 'id': 'header_2', 'name': 'Header', 'children': []},
+ {'level': 1, 'id': 'header', 'name': 'Header', 'html': 'Header', 'children': []},
+ {'level': 1, 'id': 'header_1', 'name': 'Header', 'html': 'Header', 'children': []},
+ {'level': 1, 'id': 'header_2', 'name': 'Header', 'html': 'Header', 'children': []},
])
def testHtmlEntities(self):
@@ -441,7 +441,7 @@ def testHtmlEntities(self):
'\n'
)
self.assertEqual(self.md.toc_tokens, [
- {'level': 1, 'id': 'foo-bar', 'name': 'Foo & bar', 'children': []},
+ {'level': 1, 'id': 'foo-bar', 'name': 'Foo & bar', 'html': 'Foo & bar', 'children': []},
])
def testHtmlSpecialChars(self):
@@ -460,7 +460,7 @@ def testHtmlSpecialChars(self):
'\n'
)
self.assertEqual(self.md.toc_tokens, [
- {'level': 1, 'id': 'foo-bar', 'name': 'Foo > & bar', 'children': []},
+ {'level': 1, 'id': 'foo-bar', 'name': 'Foo > & bar', 'html': 'Foo > & bar', 'children': []},
])
def testRawHtml(self):
@@ -479,7 +479,7 @@ def testRawHtml(self):
'\n'
)
self.assertEqual(self.md.toc_tokens, [
- {'level': 1, 'id': 'foo-bar-baz', 'name': 'Foo Bar Baz.', 'children': []},
+ {'level': 1, 'id': 'foo-bar-baz', 'name': 'Foo Bar Baz.', 'html': 'Foo Bar Baz.', 'children': []},
])
def testBaseLevel(self):
@@ -508,9 +508,9 @@ def testBaseLevel(self):
'\n'
)
self.assertEqual(md.toc_tokens, [
- {'level': 5, 'id': 'some-header', 'name': 'Some Header', 'children': [
- {'level': 6, 'id': 'next-level', 'name': 'Next Level', 'children': []},
- {'level': 6, 'id': 'too-high', 'name': 'Too High', 'children': []},
+ {'level': 5, 'id': 'some-header', 'name': 'Some Header', 'html': 'Some Header', 'children': [
+ {'level': 6, 'id': 'next-level', 'name': 'Next Level', 'html': 'Next Level', 'children': []},
+ {'level': 6, 'id': 'too-high', 'name': 'Too High', 'html': 'Too High', 'children': []},
]},
])
@@ -532,9 +532,13 @@ def testHeaderInlineMarkup(self):
'\n' # noqa
'\n'
)
- self.assertEqual(self.md.toc_tokens, [
- {'level': 1, 'id': 'some-header-with-markup', 'name': 'Some Header with markup.', 'children': []},
- ])
+ self.assertEqual(self.md.toc_tokens, [{
+ 'level': 1,
+ 'id': 'some-header-with-markup',
+ 'name': 'Some Header with markup.',
+ 'html': 'Some Header with markup.',
+ 'children': []
+ }])
def testTitle(self):
""" Test TOC Title. """
@@ -549,6 +553,7 @@ def testTitle(self):
def testWithAttrList(self):
""" Test TOC with `attr_list` Extension. """
+ self.maxDiff = None
md = markdown.Markdown(extensions=['toc', 'attr_list'])
text = ('# Header 1\n\n'
'## Header 2 { #foo }\n\n'
@@ -580,12 +585,12 @@ def testWithAttrList(self):
'\n'
)
self.assertEqual(md.toc_tokens, [
- {'level': 1, 'id': 'header-1', 'name': 'Header 1', 'children': [
- {'level': 2, 'id': 'foo', 'name': 'Header 2', 'children': []},
- {'level': 2, 'id': 'header-3', 'name': 'Foo Bar', 'children': []}
+ {'level': 1, 'id': 'header-1', 'name': 'Header 1', 'html': 'Header 1', 'children': [
+ {'level': 2, 'id': 'foo', 'name': 'Header 2', 'html': 'Header 2', 'children': []},
+ {'level': 2, 'id': 'header-3', 'name': 'Foo Bar', 'html': 'Header 3', 'children': []}
]},
- {'level': 1, 'id': 'header-4', 'name': 'Foo > Baz', 'children': []},
- {'level': 1, 'id': 'header-5', 'name': 'Foo Quux', 'children': []},
+ {'level': 1, 'id': 'header-4', 'name': 'Foo > Baz', 'html': 'Header 4', 'children': []},
+ {'level': 1, 'id': 'header-5', 'name': 'Foo Quux', 'html': 'Header 5', 'children': []},
])
def testUniqueFunc(self):
diff --git a/tests/test_syntax/extensions/test_smarty.py b/tests/test_syntax/extensions/test_smarty.py
index 8a176745..d2d9b094 100644
--- a/tests/test_syntax/extensions/test_smarty.py
+++ b/tests/test_syntax/extensions/test_smarty.py
@@ -216,6 +216,7 @@ def test_smarty_and_toc(self):
'level': 1,
'id': 'foo-bar',
'name': 'Foo — bar',
+ 'html': 'Foo — bar
',
'children': [],
},
],
diff --git a/tests/test_syntax/extensions/test_toc.py b/tests/test_syntax/extensions/test_toc.py
index 79764364..ff5a1774 100644
--- a/tests/test_syntax/extensions/test_toc.py
+++ b/tests/test_syntax/extensions/test_toc.py
@@ -140,11 +140,13 @@ def testMinMaxLevel(self):
'level': 3,
'id': 'header-3',
'name': 'Header 3',
+ 'html': 'Header 3',
'children': [
{
'level': 4,
'id': 'header-4',
'name': 'Header 4',
+ 'html': 'Header 4',
'children': []
}
]
@@ -189,11 +191,13 @@ def testMaxLevel(self):
'level': 1,
'id': 'header-1',
'name': 'Header 1',
+ 'html': 'Header 1',
'children': [
{
'level': 2,
'id': 'header-2',
'name': 'Header 2',
+ 'html': 'Header 2',
'children': []
}
]
@@ -245,11 +249,13 @@ def testMinMaxLevelwithAnchorLink(self):
'level': 3,
'id': 'header-3',
'name': 'Header 3',
+ 'html': 'Header 3',
'children': [
{
'level': 4,
'id': 'header-4',
'name': 'Header 4',
+ 'html': 'Header 4',
'children': []
}
]
@@ -301,11 +307,13 @@ def testMinMaxLevelwithPermalink(self):
'level': 3,
'id': 'header-3',
'name': 'Header 3',
+ 'html': 'Header 3',
'children': [
{
'level': 4,
'id': 'header-4',
'name': 'Header 4',
+ 'html': 'Header 4',
'children': []
}
]
@@ -353,11 +361,13 @@ def testMinMaxLevelwithBaseLevel(self):
'level': 4,
'id': 'second-level',
'name': 'Second Level',
+ 'html': 'Second Level',
'children': [
{
'level': 5,
'id': 'third-level',
'name': 'Third Level',
+ 'html': 'Third Level',
'children': []
}
]
@@ -402,11 +412,13 @@ def testMaxLevelwithBaseLevel(self):
'level': 2,
'id': 'some-header',
'name': 'Some Header',
+ 'html': 'Some Header',
'children': [
{
'level': 3,
'id': 'next-level',
'name': 'Next Level',
+ 'html': 'Next Level',
'children': []
}
]
@@ -455,6 +467,7 @@ def test_escaped_char_in_id(self):
'level': 1,
'id': 'escaped_character',
'name': 'escaped_character',
+ 'html': 'escaped_character',
'children': []
}
]
@@ -671,3 +684,67 @@ def testTOCWithCustomTitleClass(self):
),
extensions=[TocExtension(title_class="tocname", title='ToC')]
)
+
+ def testHeadingRemoveFootnoteRef(self):
+
+ self.assertMarkdownRenders(
+ self.dedent(
+ '''
+ # Header 1[^1]
+ # Header[^1] 2
+ # Header *subelement*[^1] 3
+
+ [^1]: footnote
+ '''
+ ),
+ self.dedent(
+ '''
+