Skip to content

Commit

Permalink
Abbr Extension: Definition Sorting and Glossary storage
Browse files Browse the repository at this point in the history
* `AbbrTreeprocessor` now sorts the abbreviation
list by length before processing the element tree

This ensures that multi-word abbreviations are
implemented even if an abbreviation exists for
one of those component words.

* Added handling for empty abbreviations.

* Added a `glossary` option for the `abbr` extension
which accepts a dictionary.
  • Loading branch information
nbanyan committed Jun 11, 2024
1 parent ec8c305 commit 33359fa
Show file tree
Hide file tree
Showing 4 changed files with 212 additions and 16 deletions.
15 changes: 15 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,21 @@ better reflects what it is. `AbbrPreprocessor` has been deprecated.

A call to `Markdown.reset()` now clears all previously defined abbreviations.

Abbreviations are now sorted by length (longest first) before executing `AbbrTreeprocessor`
to ensure that multi-word abbreviations are applied even if an abbreviation
is also defined for one of their component words. (#1465)

Abbreviations without a definition are now ignored. This avoids applying
abbr tags to text without a title value.

Added an optional `glossary` configuration option to the abbreviations extension.
This provides a simple and efficient way to apply a dictionary of abbreviations
to every page.

Abbreviations can now be disabled by setting their definition to `""` or `''`.
This can be useful when using the `glossary` option.


### Fixed

* Fixed links to source code on GitHub from the documentation (#1453).
Expand Down
17 changes: 16 additions & 1 deletion docs/extensions/abbreviations.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,25 @@ Usage
See [Extensions](index.md) for general extension usage. Use `abbr` as the name
of the extension.

This extension does not accept any special configuration options.
The following options are provided to configure the output:

* **`glossary`**:
A dictionary where the `key` is the abbreviation and the `value` is the definition.

A trivial example:

```python
markdown.markdown(some_text, extensions=['abbr'])
```

Disabling Abbreviations
-----------------------

When using the `glossary` option, there may be times when you need to turn off
a specific abbreviation. To do this, set the abbreviation's definition to `''` or `""`.

```md
The HTML abbreviation is disabled on this page.

*[HTML]: ''
```
60 changes: 45 additions & 15 deletions markdown/extensions/abbr.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,15 +41,38 @@ class AbbrExtension(Extension):

def __init__(self, **kwargs):
    """
    Initiate Extension and set up configs.

    All keyword arguments are forwarded unchanged to the base class initializer.
    """
    # `self.config` is assigned before calling the base initializer.
    # NOTE(review): presumably the base class applies user-supplied options
    # from `kwargs` onto this dict -- confirm against `Extension.__init__`.
    self.config = {
        'glossary': [
            {},
            # The trailing space keeps the two concatenated fragments from
            # running together as "definition.Default".
            'A dictionary where the `key` is the abbreviation and the `value` is the definition. '
            'Default: `{}`'
        ],
    }
    """ Default configuration options. """
    super().__init__(**kwargs)
    # Abbreviations currently in effect; handed to both processors in `extendMarkdown`.
    self.abbrs = {}
    # Abbreviations that are copied back into `self.abbrs` on every `reset()`.
    self.glossary = {}

def reset(self):
    """ Drop every collected abbreviation, then restore the glossary entries. """
    active = self.abbrs
    # Clear in place: the same dict object stays referenced by `self.abbrs`.
    active.clear()
    # An empty glossary has nothing to contribute, so only merge when it has entries.
    if self.glossary:
        active.update(self.glossary)

def reset_glossary(self):
    """ Empty the glossary in place; `self.abbrs` is not touched by this call. """
    self.glossary.clear()

def load_glossary(self, dictionary: dict[str, str]):
    """
    Add the entries of `dictionary` to the glossary.

    Entries already present in the glossary take precedence: when an
    abbreviation exists in both, the glossary's current definition is kept and
    the one from `dictionary` is ignored. Only abbreviations that are not yet
    in the glossary are added.

    A falsy `dictionary` (empty or `None`) leaves the glossary unchanged.
    """
    if not dictionary:
        return
    # Later unpacking wins, so the existing glossary overrides `dictionary`.
    # A new dict is built, so the caller's `dictionary` is never mutated or aliased.
    self.glossary = {**dictionary, **self.glossary}

def extendMarkdown(self, md):
    """ Insert `AbbrTreeprocessor` and `AbbrBlockprocessor`. """
    configured = self.config['glossary'][0]
    if configured:
        # Merge the configured glossary into whatever was loaded beforehand.
        self.load_glossary(configured)
    # Seed the abbreviation store with the glossary before the processors get it.
    self.abbrs.update(self.glossary)
    md.registerExtension(self)
    # Both processors receive the same `self.abbrs` dict object.
    tree_processor = AbbrTreeprocessor(md, self.abbrs)
    md.treeprocessors.register(tree_processor, 'abbr', 7)
    block_processor = AbbrBlockprocessor(md.parser, self.abbrs)
    md.parser.blockprocessors.register(block_processor, 'abbr', 16)
Expand All @@ -69,13 +92,14 @@ def iter_element(self, el: etree.Element, parent: etree.Element | None = None) -
self.iter_element(child, el)
if text := el.text:
for m in reversed(list(self.RE.finditer(text))):
abbr = etree.Element('abbr', {'title': self.abbrs[m.group(0)]})
abbr.text = AtomicString(m.group(0))
abbr.tail = text[m.end():]
el.insert(0, abbr)
text = text[:m.start()]
if self.abbrs[m.group(0)]:
abbr = etree.Element('abbr', {'title': self.abbrs[m.group(0)]})
abbr.text = AtomicString(m.group(0))
abbr.tail = text[m.end():]
el.insert(0, abbr)
text = text[:m.start()]
el.text = text
if parent and el.tail:
if parent is not None and el.tail:
tail = el.tail
index = list(parent).index(el) + 1
for m in reversed(list(self.RE.finditer(tail))):
Expand All @@ -92,7 +116,9 @@ def run(self, root: etree.Element) -> etree.Element | None:
# No abbreviations defined. Skip running processor.
return
# Build and compile regex
self.RE = re.compile(f"\\b(?:{ '|'.join(re.escape(key) for key in self.abbrs) })\\b")
abbr_list = list(self.abbrs.keys())
abbr_list.sort(key=len, reverse=True)
self.RE = re.compile(f"\\b(?:{ '|'.join(re.escape(key) for key in abbr_list) })\\b")
# Step through tree and modify on matches
self.iter_element(root)

Expand Down Expand Up @@ -120,14 +146,18 @@ def run(self, parent: etree.Element, blocks: list[str]) -> bool:
if m:
abbr = m.group('abbr').strip()
title = m.group('title').strip()
self.abbrs[abbr] = title
if block[m.end():].strip():
# Add any content after match back to blocks as separate block
blocks.insert(0, block[m.end():].lstrip('\n'))
if block[:m.start()].strip():
# Add any content before match back to blocks as separate block
blocks.insert(0, block[:m.start()].rstrip('\n'))
return True
if title and abbr:
if title == "''" or title == '""':
self.abbrs.pop(abbr)
else:
self.abbrs[abbr] = title
if block[m.end():].strip():
# Add any content after match back to blocks as separate block
blocks.insert(0, block[m.end():].lstrip('\n'))
if block[:m.start()].strip():
# Add any content before match back to blocks as separate block
blocks.insert(0, block[:m.start()].rstrip('\n'))
return True
# No match. Restore block.
blocks.insert(0, block)
return False
Expand Down
136 changes: 136 additions & 0 deletions tests/test_syntax/extensions/test_abbr.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,69 @@ def test_abbr_override(self):
)
)

def test_abbr_glossary(self):
# Abbreviations supplied only through the extension's `glossary` option (the
# document contains no `*[...]:` definitions) are rendered as `<abbr>` elements.
# NOTE(review): the expected output has two paragraphs, so the input presumably
# had a blank line between `abbr` and `HTML` that this view lost -- confirm.

glossary = {
"ABBR": "Abbreviation",
"abbr": "Abbreviation",
"HTML": "Hyper Text Markup Language",
"W3C": "World Wide Web Consortium"
}

self.assertMarkdownRenders(
self.dedent(
"""
ABBR
abbr
HTML
W3C
"""
),
self.dedent(
"""
<p><abbr title="Abbreviation">ABBR</abbr>
<abbr title="Abbreviation">abbr</abbr></p>
<p><abbr title="Hyper Text Markup Language">HTML</abbr>
<abbr title="World Wide Web Consortium">W3C</abbr></p>
"""
),
extensions=[AbbrExtension(glossary=glossary)]
)

def test_abbr_glossary_2(self):
# Entries added with `load_glossary()` on the extension instance are combined
# with the `glossary` option. For `ABBR`, which is in both, the expected output
# uses the `load_glossary()` definition ("New Abbreviation").

glossary = {
"ABBR": "Abbreviation",
"abbr": "Abbreviation",
"HTML": "Hyper Text Markup Language",
"W3C": "World Wide Web Consortium"
}

glossary_2 = {
"ABBR": "New Abbreviation"
}

abbr_ext = AbbrExtension(glossary=glossary)
abbr_ext.load_glossary(glossary_2)

self.assertMarkdownRenders(
self.dedent(
"""
ABBR abbr HTML W3C
"""
),
self.dedent(
"""
<p><abbr title="New Abbreviation">ABBR</abbr> """
+ """<abbr title="Abbreviation">abbr</abbr> """
+ """<abbr title="Hyper Text Markup Language">HTML</abbr> """
+ """<abbr title="World Wide Web Consortium">W3C</abbr></p>
"""
),
extensions=[abbr_ext]
)

def test_abbr_nested(self):
self.assertMarkdownRenders(
self.dedent(
Expand Down Expand Up @@ -383,6 +446,79 @@ def test_abbr_with_attr_list(self):
extensions=['abbr', 'attr_list']
)

def test_abbr_superset_vs_subset(self):
# An abbreviation that contains other defined abbreviations (`abbr-SS` contains
# `abbr` and `SS`) must be matched as a whole and get its own definition, while
# the shorter ones still match where they stand alone.
self.assertMarkdownRenders(
self.dedent(
"""
abbr, SS, and abbr-SS should have different definitions.
*[abbr]: Abbreviation Definition
*[abbr-SS]: Abbreviation Superset Definition
*[SS]: Superset Definition
"""
),
self.dedent(
"""
<p><abbr title="Abbreviation Definition">abbr</abbr>, """
+ """<abbr title="Superset Definition">SS</abbr>, """
+ """and <abbr title="Abbreviation Superset Definition">abbr-SS</abbr> """
+ """should have different definitions.</p>
"""
)
)

def test_abbr_empty(self):
# Definitions with an empty abbreviation (`*[]:`, `*[ ]:`) or an empty title
# (`*[abbr]:`, `*[ABBR]:`) are expected to stay in the output as ordinary
# paragraph text. The first `*[abbr]:` has its title on the following line and
# is expected to define `abbr` normally.
self.assertMarkdownRenders(
self.dedent(
"""
*[abbr]:
Abbreviation Definition
abbr
*[]: Empty
*[ ]: Empty
*[abbr]:
*[ABBR]:
Testing document text.
"""
),
self.dedent(
"""
<p><abbr title="Abbreviation Definition">abbr</abbr></p>\n"""
+ """<p>*[]: Empty</p>\n"""
+ """<p>*[ ]: Empty</p>\n"""
+ """<p>*[<abbr title="Abbreviation Definition">abbr</abbr>]:</p>\n"""
+ """<p>*[ABBR]:</p>\n"""
+ """<p>Testing document text.</p>
"""
)
)

def test_abbr_clear(self):
# Redefining an abbreviation with a title of `""` or `''` is expected to
# disable it: the text renders without any `<abbr>` elements.
self.assertMarkdownRenders(
self.dedent(
"""
*[abbr]: Abbreviation Definition
*[ABBR]: Abbreviation Definition
abbr ABBR
*[abbr]: ""
*[ABBR]: ''
"""
),
self.dedent(
"""
<p>abbr ABBR</p>
"""
)
)

def test_abbr_reset(self):
ext = AbbrExtension()
md = Markdown(extensions=[ext])
Expand Down

0 comments on commit 33359fa

Please sign in to comment.