test: better unit testing for mappings.utils
joanise committed Nov 12, 2024
1 parent d662622 commit 0b2c83c
Showing 2 changed files with 32 additions and 10 deletions.
g2p/mappings/utils.py: 32 changes (25 additions, 7 deletions)
@@ -150,7 +150,7 @@ def normalize(inp: str, norm_form: Union[str, None]):
if norm_form is None or norm_form == "none":
return unicode_escape(inp)
if norm_form not in ["NFC", "NFD", "NFKC", "NFKD"]:
- raise exceptions.InvalidNormalization(normalize)
+ raise exceptions.InvalidNormalization(norm_form)
# Sadly mypy doesn't do narrowing to literals properly
norm_form = cast(Literal["NFC", "NFD", "NFKC", "NFKD"], norm_form)
normalized = ud.normalize(norm_form, unicode_escape(inp))
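For context, a minimal sketch (not part of this diff) of how the corrected argument reaches callers, assuming a standard install where g2p.mappings.utils.normalize and g2p.exceptions.InvalidNormalization are importable as shown above:

```python
# Illustrative only: exercise normalize() with a valid and an invalid norm_form.
from g2p import exceptions
from g2p.mappings.utils import normalize

print(normalize("a\u0301", "NFC"))  # combining acute composed into "á"

try:
    normalize("abc", "NFQ")  # not NFC/NFD/NFKC/NFKD and not "none"
except exceptions.InvalidNormalization as err:
    # With this fix the exception carries the offending norm_form string
    # instead of the normalize function object.
    print("InvalidNormalization raised for:", err)
```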
@@ -177,8 +177,8 @@ def compose_indices(
"""Compose indices1 + indices2 into direct arcs from the inputs of indices1
to the outputs of indices 2.
- E.g., [(0,1), (1,4)] composed with [(0,0), (1,2), (1,3), (4,2)] is
- [(0,2), (0,3), (1,2)]
+ >>> compose_indices([(0,1), (1,4)], [(0,0), (1,2), (1,3), (4,2)])
+ [(0, 2), (0, 3), (1, 2)]
"""
# for O(1) lookup of arcs leaving indices2
indices2_as_dict = defaultdict(dict) # type: ignore
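For readers skimming the new doctest, a tiny standalone re-derivation of that example (purely illustrative, not the library's implementation):

```python
# Naive arc composition, only to make the doctest above concrete.
def compose_naive(indices1, indices2):
    outgoing = {}
    for i, o in indices2:
        outgoing.setdefault(i, []).append(o)
    # Follow each arc of indices1 through indices2.
    return sorted({(i, o2) for i, o in indices1 for o2 in outgoing.get(o, [])})

print(compose_naive([(0, 1), (1, 4)], [(0, 0), (1, 2), (1, 3), (4, 2)]))
# -> [(0, 2), (0, 3), (1, 2)]
```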
@@ -238,7 +238,7 @@ def normalize_with_indices(
return normalize_to_NFD_with_indices(inp, norm_form)
if norm_form in ("none", None):
return inp, [(i, i) for i in range(len(inp))]
- raise exceptions.InvalidNormalization(normalize)
+ raise exceptions.InvalidNormalization(norm_form)


def unicode_escape(text):
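A similar sketch for normalize_with_indices, based only on the branches visible in this hunk: the "none" branch returns the input with identity index pairs, and an unrecognized form now raises InvalidNormalization carrying the bad value (the case the new test_bad_normalization below exercises):

```python
# Illustrative only; assumes normalize_with_indices is importable as shown above.
from g2p.mappings.utils import normalize_with_indices

print(normalize_with_indices("abc", "none"))
# -> ('abc', [(0, 0), (1, 1), (2, 2)])
```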
@@ -597,7 +597,13 @@ def ignore_aliases(self, *_args):

def merge_same_type_tokens(tokens: list) -> list:
"""Merge tokens that have the same type. Destroys tokens in the process.
- Tokens are represented as dicts {"text": str, "is_word": bool}."""
+ Tokens are represented as dicts {"text": str, "is_word": bool}.
+ >>> merge_same_type_tokens([{"text": "test", "is_word": True}, {"text": "b", "is_word": True}, {"text": ":", "is_word": False}, {"text": ",", "is_word": False}])
+ [{'text': 'testb', 'is_word': True}, {'text': ':,', 'is_word': False}]
+ >>> merge_same_type_tokens([])
+ []
+ """
if not tokens:
return []
merged_tokens = [tokens[0]]
@@ -610,7 +616,13 @@ def merge_same_type_tokens(tokens: list) -> list:


def split_non_word_tokens(tokens: list) -> list:
"""Split non-word units into characters. Destroys tokens in the process."""
"""Split non-word units into characters. Destroys tokens in the process.
>>> split_non_word_tokens([{"text": "test", "is_word": True}, {"text": ":,- ", "is_word": False}, {"text": "", "is_word": False}])
[{'text': 'test', 'is_word': True}, {'text': ':', 'is_word': False}, {'text': ',', 'is_word': False}, {'text': '-', 'is_word': False}, {'text': ' ', 'is_word': False}]
>>> split_non_word_tokens([])
[]
"""
new_tokens = []
for token in tokens:
if not token["is_word"]:
@@ -623,7 +635,13 @@ def split_non_word_tokens(tokens: list) -> list:


def merge_non_word_tokens(tokens: list) -> list:
"""Merge consecutive non-word units into a single token. Destroys tokens in the process."""
"""Merge consecutive non-word units into a single token. Destroys tokens in the process.
>>> merge_non_word_tokens([{"text": "test", "is_word": True}, {"text": ":", "is_word": False}, {"text": ",", "is_word": False}])
[{'text': 'test', 'is_word': True}, {'text': ':,', 'is_word': False}]
>>> merge_non_word_tokens([])
[]
"""
if not tokens:
return tokens
merged_tokens = [tokens[0]]
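Taken together, the token helpers above compose naturally; a small sketch consistent with the new doctests (illustrative only, not asserting how g2p itself chains them):

```python
# Splitting then re-merging non-word tokens round-trips a simple token list.
from g2p.mappings.utils import merge_non_word_tokens, split_non_word_tokens

tokens = [{"text": "ab", "is_word": True}, {"text": ", ", "is_word": False}]
split = split_non_word_tokens(tokens)
print(split)
# -> [{'text': 'ab', 'is_word': True}, {'text': ',', 'is_word': False}, {'text': ' ', 'is_word': False}]
print(merge_non_word_tokens(split))
# -> [{'text': 'ab', 'is_word': True}, {'text': ', ', 'is_word': False}]
```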
g2p/tests/test_utils.py: 10 changes (7 additions, 3 deletions)
@@ -14,9 +14,9 @@
from pep440 import is_canonical

import g2p
+ import g2p.exceptions
from g2p import get_arpabet_langs
from g2p._version import VERSION, version_tuple
- from g2p.exceptions import IncorrectFileType, RecursionError
from g2p.log import LOGGER
from g2p.mappings import Mapping, utils
from g2p.mappings.utils import RULE_ORDERING_ENUM, Rule
@@ -60,7 +60,7 @@ def test_abb_expand(self):
) # shouldn't allow self-referential abbreviations
expanded_plain = utils.expand_abbreviations("test", test_dict)
expanded_bad_plain = utils.expand_abbreviations("test", bad_dict)
- with self.assertRaises(RecursionError):
+ with self.assertRaises(g2p.exceptions.RecursionError):
utils.expand_abbreviations("HIGH_VOWELS", bad_dict)
expanded_non_recursive = utils.expand_abbreviations("HIGH_VOWELS", test_dict)
expanded_recursive = utils.expand_abbreviations("VOWELS", test_dict)
@@ -156,7 +156,7 @@ def test_escape_special(self):
)

def test_load_abbs(self):
- with self.assertRaises(IncorrectFileType):
+ with self.assertRaises(g2p.exceptions.IncorrectFileType):
utils.load_abbreviations_from_file(
os.path.join(PUBLIC_DIR, "mappings", "abbreviations.json")
)
@@ -212,6 +212,10 @@ def test_generated_mapping(self):
test_config_added.display_name, "test custom to test-out custom"
)

+ def test_bad_normalization(self):
+ with self.assertRaises(g2p.exceptions.InvalidNormalization):
+ utils.normalize_with_indices("test", "bad")
+
def test_normalize_to_NFD_with_indices(self):
# Useful site to get combining character code points:
# http://www.alanwood.net/unicode/combining_diacritical_marks.html
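Since the new examples are written as doctests, one way to exercise them locally (a sketch assuming the package is importable; not necessarily how the project's own test suite runs them):

```python
# Run the doctests in g2p.mappings.utils via the standard library.
import doctest

from g2p.mappings import utils

results = doctest.testmod(utils)
print(results)  # TestResults(failed=0, attempted=N) if every example passes
```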