test: better unit testing for mappings.utils
joanise committed Nov 12, 2024
1 parent d662622 commit 0b2c83c
Showing 2 changed files with 32 additions and 10 deletions.
g2p/mappings/utils.py: 32 changes (25 additions, 7 deletions)
@@ -150,7 +150,7 @@ def normalize(inp: str, norm_form: Union[str, None]):
if norm_form is None or norm_form == "none":
return unicode_escape(inp)
if norm_form not in ["NFC", "NFD", "NFKC", "NFKD"]:
- raise exceptions.InvalidNormalization(normalize)
+ raise exceptions.InvalidNormalization(norm_form)
# Sadly mypy doesn't do narrowing to literals properly
norm_form = cast(Literal["NFC", "NFD", "NFKC", "NFKD"], norm_form)
normalized = ud.normalize(norm_form, unicode_escape(inp))
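For context, a minimal sketch (not part of this diff) of how the corrected argument reaches callers, assuming a standard install where g2p.mappings.utils.normalize and g2p.exceptions.InvalidNormalization are importable as shown above:

```python
# Illustrative only: exercise normalize() with a valid and an invalid norm_form.
from g2p import exceptions
from g2p.mappings.utils import normalize

print(normalize("a\u0301", "NFC"))  # combining acute composed into "á"

try:
    normalize("abc", "NFQ")  # not NFC/NFD/NFKC/NFKD and not "none"
except exceptions.InvalidNormalization as err:
    # With this fix the exception carries the offending norm_form string
    # instead of the normalize function object.
    print("InvalidNormalization raised for:", err)
```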
@@ -177,8 +177,8 @@ def compose_indices(
"""Compose indices1 + indices2 into direct arcs from the inputs of indices1
to the outputs of indices 2.
- E.g., [(0,1), (1,4)] composed with [(0,0), (1,2), (1,3), (4,2)] is
- [(0,2), (0,3), (1,2)]
+ >>> compose_indices([(0,1), (1,4)], [(0,0), (1,2), (1,3), (4,2)])
+ [(0, 2), (0, 3), (1, 2)]
"""
# for O(1) lookup of arcs leaving indices2
indices2_as_dict = defaultdict(dict) # type: ignore
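For readers skimming the new doctest, a tiny standalone re-derivation of that example (purely illustrative, not the library's implementation):

```python
# Naive arc composition, only to make the doctest above concrete.
def compose_naive(indices1, indices2):
    outgoing = {}
    for i, o in indices2:
        outgoing.setdefault(i, []).append(o)
    # Follow each arc of indices1 through indices2.
    return sorted({(i, o2) for i, o in indices1 for o2 in outgoing.get(o, [])})

print(compose_naive([(0, 1), (1, 4)], [(0, 0), (1, 2), (1, 3), (4, 2)]))
# -> [(0, 2), (0, 3), (1, 2)]
```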
@@ -238,7 +238,7 @@ def normalize_with_indices(
return normalize_to_NFD_with_indices(inp, norm_form)
if norm_form in ("none", None):
return inp, [(i, i) for i in range(len(inp))]
- raise exceptions.InvalidNormalization(normalize)
+ raise exceptions.InvalidNormalization(norm_form)


def unicode_escape(text):
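A similar sketch for normalize_with_indices, based only on the branches visible in this hunk: the "none" branch returns the input with identity index pairs, and an unrecognized form now raises InvalidNormalization carrying the bad value (the case the new test_bad_normalization below exercises):

```python
# Illustrative only; assumes normalize_with_indices is importable as shown above.
from g2p.mappings.utils import normalize_with_indices

print(normalize_with_indices("abc", "none"))
# -> ('abc', [(0, 0), (1, 1), (2, 2)])
```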
@@ -597,7 +597,13 @@ def ignore_aliases(self, *_args):

def merge_same_type_tokens(tokens: list) -> list:
"""Merge tokens that have the same type. Destroys tokens in the process.
- Tokens are represented as dicts {"text": str, "is_word": bool}."""
+ Tokens are represented as dicts {"text": str, "is_word": bool}.
+ >>> merge_same_type_tokens([{"text": "test", "is_word": True}, {"text": "b", "is_word": True}, {"text": ":", "is_word": False}, {"text": ",", "is_word": False}])
+ [{'text': 'testb', 'is_word': True}, {'text': ':,', 'is_word': False}]
+ >>> merge_same_type_tokens([])
+ []
+ """
if not tokens:
return []
merged_tokens = [tokens[0]]
@@ -610,7 +616,13 @@ def merge_same_type_tokens(tokens: list) -> list:


def split_non_word_tokens(tokens: list) -> list:
"""Split non-word units into characters. Destroys tokens in the process."""
"""Split non-word units into characters. Destroys tokens in the process.
>>> split_non_word_tokens([{"text": "test", "is_word": True}, {"text": ":,- ", "is_word": False}, {"text": "", "is_word": False}])
[{'text': 'test', 'is_word': True}, {'text': ':', 'is_word': False}, {'text': ',', 'is_word': False}, {'text': '-', 'is_word': False}, {'text': ' ', 'is_word': False}]
>>> split_non_word_tokens([])
[]
"""
new_tokens = []
for token in tokens:
if not token["is_word"]:
@@ -623,7 +635,13 @@ def split_non_word_tokens(tokens: list) -> list:


def merge_non_word_tokens(tokens: list) -> list:
"""Merge consecutive non-word units into a single token. Destroys tokens in the process."""
"""Merge consecutive non-word units into a single token. Destroys tokens in the process.
>>> merge_non_word_tokens([{"text": "test", "is_word": True}, {"text": ":", "is_word": False}, {"text": ",", "is_word": False}])
[{'text': 'test', 'is_word': True}, {'text': ':,', 'is_word': False}]
>>> merge_non_word_tokens([])
[]
"""
if not tokens:
return tokens
merged_tokens = [tokens[0]]
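Taken together, the token helpers above compose naturally; a small sketch consistent with the new doctests (illustrative only, not asserting how g2p itself chains them):

```python
# Splitting then re-merging non-word tokens round-trips a simple token list.
from g2p.mappings.utils import merge_non_word_tokens, split_non_word_tokens

tokens = [{"text": "ab", "is_word": True}, {"text": ", ", "is_word": False}]
split = split_non_word_tokens(tokens)
print(split)
# -> [{'text': 'ab', 'is_word': True}, {'text': ',', 'is_word': False}, {'text': ' ', 'is_word': False}]
print(merge_non_word_tokens(split))
# -> [{'text': 'ab', 'is_word': True}, {'text': ', ', 'is_word': False}]
```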
g2p/tests/test_utils.py: 10 changes (7 additions, 3 deletions)
@@ -14,9 +14,9 @@
from pep440 import is_canonical

import g2p
+ import g2p.exceptions
from g2p import get_arpabet_langs
from g2p._version import VERSION, version_tuple
- from g2p.exceptions import IncorrectFileType, RecursionError
from g2p.log import LOGGER
from g2p.mappings import Mapping, utils
from g2p.mappings.utils import RULE_ORDERING_ENUM, Rule
@@ -60,7 +60,7 @@ def test_abb_expand(self):
) # shouldn't allow self-referential abbreviations
expanded_plain = utils.expand_abbreviations("test", test_dict)
expanded_bad_plain = utils.expand_abbreviations("test", bad_dict)
- with self.assertRaises(RecursionError):
+ with self.assertRaises(g2p.exceptions.RecursionError):
utils.expand_abbreviations("HIGH_VOWELS", bad_dict)
expanded_non_recursive = utils.expand_abbreviations("HIGH_VOWELS", test_dict)
expanded_recursive = utils.expand_abbreviations("VOWELS", test_dict)
@@ -156,7 +156,7 @@ def test_escape_special(self):
)

def test_load_abbs(self):
- with self.assertRaises(IncorrectFileType):
+ with self.assertRaises(g2p.exceptions.IncorrectFileType):
utils.load_abbreviations_from_file(
os.path.join(PUBLIC_DIR, "mappings", "abbreviations.json")
)
@@ -212,6 +212,10 @@ def test_generated_mapping(self):
test_config_added.display_name, "test custom to test-out custom"
)

+ def test_bad_normalization(self):
+ with self.assertRaises(g2p.exceptions.InvalidNormalization):
+ utils.normalize_with_indices("test", "bad")
+
def test_normalize_to_NFD_with_indices(self):
# Useful site to get combining character code points:
# http://www.alanwood.net/unicode/combining_diacritical_marks.html
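Since the new examples are written as doctests, one way to exercise them locally (a sketch assuming the package is importable; not necessarily how the project's own test suite runs them):

```python
# Run the doctests in g2p.mappings.utils via the standard library.
import doctest

from g2p.mappings import utils

results = doctest.testmod(utils)
print(results)  # TestResults(failed=0, attempted=N) if every example passes
```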