Abbr Extension: Definition Sorting and Glossary storage

* `AbbrTreeprocessor` now sorts the abbreviation list by length before processing the element tree. This ensures that multi-word abbreviations are implemented even if an abbreviation exists for one of those component words.
* Added handling for empty abbreviations.
* Added a `glossary` option for the `abbr` extension which accepts a dictionary.
Python-Markdown · Jun 11, 2024 · 33359fa · 33359fa
1 parent ec8c305
commit 33359fa
Show file tree

Hide file tree

Showing 4 changed files with 212 additions and 16 deletions.
diff --git a/docs/changelog.md b/docs/changelog.md
@@ -23,6 +23,21 @@ better reflects what it is. `AbbrPreprocessor` has been deprecated.
 
 A call to `Markdown.reset()` now clears all previously defined abbreviations.
 
+Abbreviations are now sorted by length before executing `AbbrTreeprocessor`
+to ensure that multi-word abbreviations are implemented even if an abbreviation
+exists for one of those component words. (#1465)
+
+Abbreviations without a definition are now ignored. This avoids applying
+abbr tags to text without a title value.
+
+Added an optional `glossary` configuration option to the abbreviations extension.
+This provides a simple and efficient way to apply a dictionary of abbreviations
+to every page.
+
+Abbreviations can now be disabled by setting their definition to `""` or `''`.
+This can be useful when using the `glossary` option.
+
+
 ### Fixed
 
 * Fixed links to source code on GitHub from the documentation (#1453).

diff --git a/docs/extensions/abbreviations.md b/docs/extensions/abbreviations.md
@@ -46,10 +46,25 @@ Usage
 See [Extensions](index.md) for general extension usage. Use `abbr` as the name
 of the extension.
 
-This extension does not accept any special configuration options.
+The following options are provided to configure the output:
+
+* **`glossary`**:
+    A dictionary where the `key` is the abbreviation and the `value` is the definition.
 
 A trivial example:
 
 ```python
 markdown.markdown(some_text, extensions=['abbr'])
 ```
+
+Disabling Abbreviations
+-----------------------
+
+When using the `glossary` option, there may be times when you need to turn off
+a specific abbreviation. To do this, set the abbreviation to `''` or `""`.
+
+```md
+The HTML abbreviation is disabled on this page.
+
+*[HTML]: ''
+```
diff --git a/markdown/extensions/abbr.py b/markdown/extensions/abbr.py
@@ -41,15 +41,38 @@ class AbbrExtension(Extension):
 
     def __init__(self, **kwargs):
         """ Initiate Extension and set up configs. """
+        self.config = {
+            'glossary': [
+                {},
+                # Adjacent string literals are concatenated, so the second one starts with a
+                # space to keep the two sentences of the help text apart.
+                'A dictionary where the `key` is the abbreviation and the `value` is the definition.'
+                ' Default: `{}`'
+            ],
+        }
+        """ Default configuration options. """
         super().__init__(**kwargs)
         self.abbrs = {}
+        self.glossary = {}
 
     def reset(self):
         """ Clear all previously defined abbreviations. """
         self.abbrs.clear()
+        # Re-apply the glossary so its entries are still defined after the reset.
+        if (self.glossary):
+            self.abbrs.update(self.glossary)
+
+    def reset_glossary(self):
+        """ Clear all abbreviations from the glossary. """
+        # Only `self.glossary` is emptied here; `self.abbrs` is left untouched.
+        self.glossary.clear()
+
+    def load_glossary(self, dictionary: dict[str, str]):
+        """Adds `dictionary` to our glossary. Existing glossary entries are not overwritten."""
+        if dictionary:
+            # `self.glossary` is unpacked last, so entries it already holds win over `dictionary`.
+            self.glossary = {**dictionary, **self.glossary}
 
     def extendMarkdown(self, md):
         """ Insert `AbbrTreeprocessor` and `AbbrBlockprocessor`. """
+        # Merge the `glossary` config option into the glossary; entries already loaded take precedence.
+        if (self.config['glossary'][0]):
+            self.load_glossary(self.config['glossary'][0])
+        # Seed the abbreviation dict that is handed to both processors below.
+        self.abbrs.update(self.glossary)
         md.registerExtension(self)
         md.treeprocessors.register(AbbrTreeprocessor(md, self.abbrs), 'abbr', 7)
         md.parser.blockprocessors.register(AbbrBlockprocessor(md.parser, self.abbrs), 'abbr', 16)
@@ -69,13 +92,14 @@ def iter_element(self, el: etree.Element, parent: etree.Element | None = None) -
             self.iter_element(child, el)
         if text := el.text:
             for m in reversed(list(self.RE.finditer(text))):
-                abbr = etree.Element('abbr', {'title': self.abbrs[m.group(0)]})
-                abbr.text = AtomicString(m.group(0))
-                abbr.tail = text[m.end():]
-                el.insert(0, abbr)
-                text = text[:m.start()]
+                if self.abbrs[m.group(0)]:
+                    abbr = etree.Element('abbr', {'title': self.abbrs[m.group(0)]})
+                    abbr.text = AtomicString(m.group(0))
+                    abbr.tail = text[m.end():]
+                    el.insert(0, abbr)
+                    text = text[:m.start()]
             el.text = text
-        if parent and el.tail:
+        if parent is not None and el.tail:
             tail = el.tail
             index = list(parent).index(el) + 1
             for m in reversed(list(self.RE.finditer(tail))):
@@ -92,7 +116,9 @@ def run(self, root: etree.Element) -> etree.Element | None:
             # No abbreviations defined. Skip running processor.
             return
         # Build and compile regex
-        self.RE = re.compile(f"\\b(?:{ '|'.join(re.escape(key) for key in self.abbrs) })\\b")
+        abbr_list = list(self.abbrs.keys())
+        abbr_list.sort(key=len, reverse=True)
+        self.RE = re.compile(f"\\b(?:{ '|'.join(re.escape(key) for key in abbr_list) })\\b")
         # Step through tree and modify on matches
         self.iter_element(root)
 
@@ -120,14 +146,18 @@ def run(self, parent: etree.Element, blocks: list[str]) -> bool:
         if m:
             abbr = m.group('abbr').strip()
             title = m.group('title').strip()
-            self.abbrs[abbr] = title
-            if block[m.end():].strip():
-                # Add any content after match back to blocks as separate block
-                blocks.insert(0, block[m.end():].lstrip('\n'))
-            if block[:m.start()].strip():
-                # Add any content before match back to blocks as separate block
-                blocks.insert(0, block[:m.start()].rstrip('\n'))
-            return True
+            if title and abbr:
+                if title == "''" or title == '""':
+                    self.abbrs.pop(abbr)
+                else:
+                    self.abbrs[abbr] = title
+                if block[m.end():].strip():
+                    # Add any content after match back to blocks as separate block
+                    blocks.insert(0, block[m.end():].lstrip('\n'))
+                if block[:m.start()].strip():
+                    # Add any content before match back to blocks as separate block
+                    blocks.insert(0, block[:m.start()].rstrip('\n'))
+                return True
         # No match. Restore block.
         blocks.insert(0, block)
         return False

diff --git a/tests/test_syntax/extensions/test_abbr.py b/tests/test_syntax/extensions/test_abbr.py
@@ -136,6 +136,69 @@ def test_abbr_override(self):
             )
         )
 
+    def test_abbr_glossary(self):
+        # Abbreviations passed through the `glossary` option are applied even though
+        # the document itself contains no `*[...]:` definitions.
+
+        glossary = {
+            "ABBR": "Abbreviation",
+            "abbr": "Abbreviation",
+            "HTML": "Hyper Text Markup Language",
+            "W3C": "World Wide Web Consortium"
+        }
+
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                ABBR
+                abbr
+
+                HTML
+                W3C
+                """
+            ),
+            self.dedent(
+                """
+                <p><abbr title="Abbreviation">ABBR</abbr>
+                <abbr title="Abbreviation">abbr</abbr></p>
+                <p><abbr title="Hyper Text Markup Language">HTML</abbr>
+                <abbr title="World Wide Web Consortium">W3C</abbr></p>
+                """
+            ),
+            extensions=[AbbrExtension(glossary=glossary)]
+        )
+
+    def test_abbr_glossary_2(self):
+        # A glossary added with `load_glossary()` after construction is combined with the
+        # `glossary` option; `ABBR` is expected to render with the definition from `glossary_2`.
+
+        glossary = {
+            "ABBR": "Abbreviation",
+            "abbr": "Abbreviation",
+            "HTML": "Hyper Text Markup Language",
+            "W3C": "World Wide Web Consortium"
+        }
+
+        glossary_2 = {
+            "ABBR": "New Abbreviation"
+        }
+
+        abbr_ext = AbbrExtension(glossary=glossary)
+        abbr_ext.load_glossary(glossary_2)
+
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                ABBR abbr HTML W3C
+                """
+            ),
+            self.dedent(
+                """
+                <p><abbr title="New Abbreviation">ABBR</abbr> """
+                + """<abbr title="Abbreviation">abbr</abbr> """
+                + """<abbr title="Hyper Text Markup Language">HTML</abbr> """
+                + """<abbr title="World Wide Web Consortium">W3C</abbr></p>
+                """
+            ),
+            extensions=[abbr_ext]
+        )
+
     def test_abbr_nested(self):
         self.assertMarkdownRenders(
             self.dedent(
@@ -383,6 +446,79 @@ def test_abbr_with_attr_list(self):
             extensions=['abbr', 'attr_list']
         )
 
+    def test_abbr_superset_vs_subset(self):
+        # `abbr-SS` contains both `abbr` and `SS`; it must be wrapped once with its own
+        # definition instead of being split into the two shorter abbreviations.
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                abbr, SS, and abbr-SS should have different definitions.
+
+                *[abbr]: Abbreviation Definition
+                *[abbr-SS]: Abbreviation Superset Definition
+                *[SS]: Superset Definition
+                """
+            ),
+            self.dedent(
+                """
+                <p><abbr title="Abbreviation Definition">abbr</abbr>, """
+                + """<abbr title="Superset Definition">SS</abbr>, """
+                + """and <abbr title="Abbreviation Superset Definition">abbr-SS</abbr> """
+                + """should have different definitions.</p>
+                """
+            )
+        )
+
+    def test_abbr_empty(self):
+        # Definition lines with an empty abbreviation or an empty title are expected to stay
+        # in the output as ordinary paragraphs, while the earlier `abbr` definition still applies.
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                *[abbr]:
+                Abbreviation Definition
+
+                abbr
+
+                *[]: Empty
+
+                *[ ]: Empty
+
+                *[abbr]:
+
+                *[ABBR]:
+
+                Testing document text.
+                """
+            ),
+            self.dedent(
+                """
+                <p><abbr title="Abbreviation Definition">abbr</abbr></p>\n"""
+                + """<p>*[]: Empty</p>\n"""
+                + """<p>*[ ]: Empty</p>\n"""
+                + """<p>*[<abbr title="Abbreviation Definition">abbr</abbr>]:</p>\n"""
+                + """<p>*[ABBR]:</p>\n"""
+                + """<p>Testing document text.</p>
+                """
+            )
+        )
+
+    def test_abbr_clear(self):
+        # Redefining an abbreviation as `""` or `''` disables it, so the expected output
+        # contains no `<abbr>` tags.
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                *[abbr]: Abbreviation Definition
+                *[ABBR]: Abbreviation Definition
+
+                abbr ABBR
+
+                *[abbr]: ""
+                *[ABBR]: ''
+                """
+            ),
+            self.dedent(
+                """
+                <p>abbr ABBR</p>
+                """
+            )
+        )
+
     def test_abbr_reset(self):
         ext = AbbrExtension()
         md = Markdown(extensions=[ext])