Dev.ej/lexicon tokenizer #405

Merged · 7 commits · Nov 12, 2024
6 changes: 3 additions & 3 deletions docs/package.md
@@ -32,10 +32,10 @@ Basic usage for the language-aware tokenizer:
 from g2p import make_tokenizer
 tokenizer = make_tokenizer("dan")
 for token in tokenizer.tokenize_text("Åh, hvordan har du det, Åbenrå?"):
-    if token["is_word"]:
-        word = token["text"]
+    if token.is_word:
+        word = token.text
     else:
-        interword_punctuation_and_spaces = token["text"]
+        interword_punctuation_and_spaces = token.text
 ```

 Note that selecting the tokenizer language is important to make sure punctuation-like letters are handled correctly. For example `:` and `'` are punctuation in English but they will be part of the word tokens in Kanien'kéha (moh):
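The example that the sentence above introduces is collapsed in this diff. A minimal sketch of the idea (not part of the PR; the printed tokens are illustrative, assuming an `eng` tokenizer that treats `'` as punctuation and a `moh` tokenizer that does not):

```python
from g2p import make_tokenizer

# English tokenizer: the apostrophe is punctuation, so the word is split around it.
eng_tokens = make_tokenizer("eng").tokenize_text("Kanien'kéha")
print([(t.text, t.is_word) for t in eng_tokens])

# Mohawk (moh) tokenizer: the apostrophe is part of the word,
# so the whole string remains a single word token.
moh_tokens = make_tokenizer("moh").tokenize_text("Kanien'kéha")
print([(t.text, t.is_word) for t in moh_tokens])
```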
35 changes: 25 additions & 10 deletions g2p/__init__.py
@@ -16,10 +16,10 @@
     from g2p import make_tokenizer
     tokenizer = make_tokenizer(lang)
     for token in tokenizer.tokenize_text(input_text):
-        if token["is_word"]:
-            word = token["text"]
+        if token.is_word:
+            word = token.text
         else:
-            interword_punctuation_and_spaces = token["text"]
+            interword_punctuation_and_spaces = token.text

     from g2p import get_arpabet_langs
     LANGS, LANG_NAMES = get_arpabet_langs()
@@ -29,7 +29,7 @@
 from typing import Dict, Optional, Tuple, Union

 from g2p.exceptions import InvalidLanguageCode, NoPath
-from g2p.shared_types import BaseTokenizer, BaseTransducer
+from g2p.shared_types import BaseTokenizer, BaseTransducer, Token

 if sys.version_info < (3, 7):  # pragma: no cover
     sys.exit(
@@ -47,7 +47,7 @@ def make_g2p( # noqa: C901
     *,
     tokenize: bool = True,
     custom_tokenizer: Optional[BaseTokenizer] = None,
-):
+) -> BaseTransducer:
     """Make a g2p Transducer for mapping text from in_lang to out_lang via the
     shortest path between them.

@@ -132,13 +132,13 @@ def make_g2p( # noqa: C901
     return transducer


-def tokenize_and_map(tokenizer, transducer, input: str):
+def tokenize_and_map(tokenizer: BaseTokenizer, transducer: BaseTransducer, input: str):
     result = ""
     for token in tokenizer.tokenize_text(input):
-        if token["is_word"]:
-            result += transducer(token["text"]).output_string
+        if token.is_word:
+            result += transducer(token.text).output_string
         else:
-            result += token["text"]
+            result += token.text
     return result


@@ -213,7 +213,7 @@ def get_arpabet_langs():
     return _langs_cache, _lang_names_cache


-def make_tokenizer(in_lang=None, out_lang=None, tok_path=None):
+def make_tokenizer(in_lang=None, out_lang=None, tok_path=None) -> BaseTokenizer:
     """Make the tokenizer for input in language in_lang

     Logic used when only in_lang is provided:
@@ -234,3 +234,18 @@ def make_tokenizer(in_lang=None, out_lang=None, tok_path=None):
     from g2p.mappings.tokenizer import make_tokenizer as _make_tokenizer

     return _make_tokenizer(in_lang, out_lang, tok_path)
+
+
+# Declare what's actually part of g2p's programmatic API.
+# Please don't import anything else from g2p directly.
+__all__ = [
+    "BaseTokenizer",
+    "BaseTransducer",
+    "InvalidLanguageCode",
+    "NoPath",
+    "Token",
+    "get_arpabet_langs",
+    "make_g2p",
+    "make_tokenizer",
+    "tokenize_and_map",
+]
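As a quick orientation for the API surface declared above, here is a minimal usage sketch (the language codes and printed outputs are illustrative, not taken from this PR):

```python
from g2p import make_g2p, make_tokenizer, tokenize_and_map

# Hypothetical language pair; any in_lang/out_lang pair connected in the
# g2p network works the same way.
transducer = make_g2p("dan", "dan-ipa", tokenize=False)
print(transducer("hvordan").output_string)

# tokenize_and_map feeds word tokens through the transducer and passes
# inter-word punctuation and spaces through unchanged.
tokenizer = make_tokenizer("dan")
print(tokenize_and_map(tokenizer, transducer, "Åh, hvordan har du det?"))
```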
8 changes: 4 additions & 4 deletions g2p/api_v2.py
@@ -300,7 +300,7 @@ def convert_one_writing_or_phonetic_system_to_another( # noqa: C901
             tokenizer = g2p.make_tokenizer(in_lang)
             tokens = tokenizer.tokenize_text(request.text)
         else:
-            tokens = [{"text": request.text, "is_word": True}]
+            tokens = [g2p.Token(request.text, is_word=True)]
     except NoPath:
         raise HTTPException(
             status_code=400, detail=f"No path from {in_lang} to {out_lang}"
@@ -314,16 +314,16 @@
     segments: List[Segment] = []
     for token in tokens:
         conversions: List[Conversion] = []
-        if not token["is_word"]:  # non-word, has no in_lang/out_lang
-            tg = TransductionGraph(token["text"])
+        if not token.is_word:  # non-word, has no in_lang/out_lang
+            tg = TransductionGraph(token.text)
             conv = Conversion(substring_alignments=tg.substring_alignments())
             if request.indices:
                 conv.alignments = tg.alignments()
                 conv.input_nodes = list(tg.input_string)
                 conv.output_nodes = list(tg.output_string)
             conversions.append(conv)
         else:
-            tg = transducer(token["text"])
+            tg = transducer(token.text)
             if request.compose_from:
                 composed_tiers: List[TransductionGraph] = []
                 for tr, tier in zip(transducer.transducers, tg.tiers):
73 changes: 58 additions & 15 deletions g2p/mappings/tokenizer.py
@@ -11,10 +11,10 @@

 from g2p.exceptions import MappingMissing
 from g2p.log import LOGGER
-from g2p.mappings import Mapping
+from g2p.mappings import Mapping, utils
 from g2p.mappings.langs import LANGS_NETWORK
-from g2p.mappings.utils import get_unicode_category, is_ipa, merge_if_same_label
-from g2p.shared_types import BaseTokenizer
+from g2p.mappings.utils import is_ipa
+from g2p.shared_types import BaseTokenizer, Token


 class Tokenizer(BaseTokenizer):
@@ -42,23 +42,18 @@ def is_word_character(self, c):
         if self.delim and c == self.delim:
             return True
         assert len(c) <= 1
-        if get_unicode_category(c) in ["letter", "number", "diacritic"]:
+        if utils.get_unicode_category(c) in ["letter", "number", "diacritic"]:
             return True
         return False

-    def tokenize_text(self, text):
+    def tokenize_text(self, text: str) -> List[Token]:
         matches = self.tokenize_aux(text)
-        units = [{"text": m, "is_word": self.is_word_character(m)} for m in matches]
+        units = [Token(m, self.is_word_character(m)) for m in matches]
         if self.dot_is_letter:
             for i, unit in enumerate(units):
-                if (
-                    unit["text"] == "."
-                    and i + 1 < len(units)
-                    and units[i + 1]["is_word"]
-                ):
-                    unit["is_word"] = True
-        units = merge_if_same_label(units, "text", "is_word")
-        return units
+                if unit.text == "." and i + 1 < len(units) and units[i + 1].is_word:
+                    unit.is_word = True
+        return utils.merge_same_type_tokens(units)


 class SpecializedTokenizer(Tokenizer):
@@ -98,6 +93,51 @@ def tokenize_aux(self, text):
         return self.regex.findall(text)


+class LexiconTokenizer(Tokenizer):
+    """A lexicon-based tokenizer considers any entry in the lexicon a token,
+    even if it contains punctuation characters. For text not in the lexicon,
+    it falls back to the default tokenization.
+    """
+
+    def __init__(self, mapping: Mapping):
+        super().__init__()
+        self.mapping = mapping
+        self.lang = mapping.language_name
+
+    def _recursive_helper(self, tokens: list, output_tokens: list):
+        """Emit the longest prefix found in the lexicon, if any, as a token.
+        If none is found, emit the first unit as a token.
+        Recursively process the rest of the units.
+        """
+        if not tokens:
+            return
+        if len(tokens) == 1:
+            output_tokens.append(tokens[0])
+            return
+        for i in range(len(tokens), 0, -1):
+            candidate = "".join([u.text for u in tokens[:i]])
+            if utils.find_alignment(self.mapping.alignments, candidate.lower()):
+                output_tokens.append(Token(candidate, True))
+                return self._recursive_helper(tokens[i:], output_tokens)
+        # No prefix found, emit the first unit as a token
+        output_tokens.append(tokens[0])
+        self._recursive_helper(tokens[1:], output_tokens)
+
+    def tokenize_text(self, text: str) -> List[Token]:
+        blocks = re.split(r"(\s+)", text)
+        output_tokens = []
+        for i, block in enumerate(blocks):
+            if i % 2 == 1 and block:
+                output_tokens.append(Token(block, False))
+            else:
+                default_tokens = super().tokenize_text(block)
+                # Split non-word tokens into smaller parts for lexicon lookup
+                candidate_tokens = utils.split_non_word_tokens(default_tokens)
+                self._recursive_helper(candidate_tokens, output_tokens)
+
+        return utils.merge_non_word_tokens(output_tokens)
+
+
 class MultiHopTokenizer(SpecializedTokenizer):
     def __init__(self, mappings: List[Mapping]):
         self.delim = ""
@@ -202,7 +242,10 @@ def make_tokenizer( # noqa C901
             # Build a one-hop tokenizer
             try:
                 mapping = Mapping.find_mapping(in_lang=in_lang, out_lang=out_lang)
-                self.tokenizers[tokenizer_key] = SpecializedTokenizer(mapping)
+                if mapping.type == utils.MAPPING_TYPE.lexicon:
+                    self.tokenizers[tokenizer_key] = LexiconTokenizer(mapping)
+                else:
+                    self.tokenizers[tokenizer_key] = SpecializedTokenizer(mapping)
             except MappingMissing:
                 self.tokenizers[tokenizer_key] = self.tokenizers[None]
                 LOGGER.warning(
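To make the behaviour of `LexiconTokenizer._recursive_helper` easier to picture, here is a self-contained sketch of the same greedy longest-prefix idea over a toy lexicon. It uses plain tuples instead of `Token` and a set instead of `utils.find_alignment`, so none of it is the PR's actual code:

```python
from typing import List, Tuple

# Toy stand-in: the real LexiconTokenizer looks candidates up with
# utils.find_alignment(self.mapping.alignments, candidate.lower()).
LEXICON = {"hello", "y'all"}

def greedy_lexicon_tokens(units: List[Tuple[str, bool]]) -> List[Tuple[str, bool]]:
    """Emit the longest prefix of `units` whose concatenated text is in the
    lexicon; if no prefix matches, emit the first unit unchanged."""
    output: List[Tuple[str, bool]] = []
    while units:
        if len(units) == 1:
            output.append(units[0])
            break
        for i in range(len(units), 0, -1):
            candidate = "".join(text for text, _ in units[:i])
            if candidate.lower() in LEXICON:
                output.append((candidate, True))  # lexicon entry becomes one word token
                units = units[i:]
                break
        else:
            output.append(units[0])  # no prefix matched: pass it through
            units = units[1:]
    return output

# "y'all" stays a single word token even though "'" would normally split it.
print(greedy_lexicon_tokens([("y", True), ("'", False), ("all", True)]))
# [("y'all", True)]
```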
94 changes: 74 additions & 20 deletions g2p/mappings/utils.py
@@ -10,7 +10,6 @@
 import unicodedata as ud
 from bisect import bisect_left
 from collections import defaultdict
-from copy import deepcopy
 from enum import Enum
 from pathlib import Path
 from typing import (
@@ -43,6 +42,7 @@
 from g2p import exceptions
 from g2p.log import LOGGER
 from g2p.mappings import langs
+from g2p.shared_types import Token

 GEN_DIR = os.path.join(os.path.dirname(langs.__file__), "generated")
 GEN_CONFIG = os.path.join(GEN_DIR, "config-g2p.yaml")
@@ -151,7 +151,7 @@ def normalize(inp: str, norm_form: Union[str, None]):
     if norm_form is None or norm_form == "none":
         return unicode_escape(inp)
     if norm_form not in ["NFC", "NFD", "NFKC", "NFKD"]:
-        raise exceptions.InvalidNormalization(normalize)
+        raise exceptions.InvalidNormalization(norm_form)
     # Sadly mypy doesn't do narrowing to literals properly
     norm_form = cast(Literal["NFC", "NFD", "NFKC", "NFKD"], norm_form)
     normalized = ud.normalize(norm_form, unicode_escape(inp))
@@ -178,8 +178,8 @@ def compose_indices(
     """Compose indices1 + indices2 into direct arcs from the inputs of indices1
     to the outputs of indices 2.

-    E.g., [(0,1), (1,4)] composed with [(0,0), (1,2), (1,3), (4,2)] is
-    [(0,2), (0,3), (1,2)]
+    >>> compose_indices([(0,1), (1,4)], [(0,0), (1,2), (1,3), (4,2)])
+    [(0, 2), (0, 3), (1, 2)]
     """
     # for O(1) lookup of arcs leaving indices2
     indices2_as_dict = defaultdict(dict)  # type: ignore
@@ -239,7 +239,7 @@ def normalize_with_indices(
         return normalize_to_NFD_with_indices(inp, norm_form)
     if norm_form in ("none", None):
         return inp, [(i, i) for i in range(len(inp))]
-    raise exceptions.InvalidNormalization(normalize)
+    raise exceptions.InvalidNormalization(norm_form)


 def unicode_escape(text):
return True


def merge_if_same_label(lst_of_dicts, text_key, label_key):
results = []
current_item = None
for dct in lst_of_dicts:
if label_key not in dct:
dct[label_key] = None
if not current_item:
current_item = deepcopy(dct)
elif dct[label_key] == current_item[label_key]:
current_item[text_key] += dct[text_key]
def merge_same_type_tokens(tokens: List[Token]) -> List[Token]:
"""Merge tokens that have the same type. Destroys tokens in the process.

>>> merge_same_type_tokens([Token("test", True), Token("b", True), Token(":", False), Token(",", False)])
[Token(text='testb', is_word=True), Token(text=':,', is_word=False)]
>>> merge_same_type_tokens([])
[]
"""
if not tokens:
return []
merged_tokens = [tokens[0]]
for token in tokens[1:]:
if token.is_word == merged_tokens[-1].is_word:
merged_tokens[-1].text += token.text
else:
merged_tokens.append(token)
return merged_tokens


def split_non_word_tokens(tokens: List[Token]) -> List[Token]:
"""Split non-word units into characters. Reuses the word tokens.

Generates a maximum of 5 units per non-word token: if the input token is
more than 5 non-word characters, the output will be the first two
individually, the middle as a block, and the last two individually, because
lexicon-based tokenization does not need more granularity than that.
This prevents degenerate input like a large number of consecutive punctuation
marks from taking quadratic time in lexicon-based tokenization.

>>> split_non_word_tokens([Token("test", True), Token(":,- ", False), Token("", False)])
[Token(text='test', is_word=True), Token(text=':', is_word=False), Token(text=',', is_word=False), Token(text='-', is_word=False), Token(text=' ', is_word=False)]
>>> split_non_word_tokens([])
[]
>>> split_non_word_tokens([Token(".,.,.,.", False)])
[Token(text='.', is_word=False), Token(text=',', is_word=False), Token(text='.,.', is_word=False), Token(text=',', is_word=False), Token(text='.', is_word=False)]
"""
new_tokens = []
for token in tokens:
if not token.is_word:
text = token.text
if len(text) > 5:
new_tokens.append(Token(text[0], False))
new_tokens.append(Token(text[1], False))
new_tokens.append(Token(text[2:-2], False))
new_tokens.append(Token(text[-2], False))
new_tokens.append(Token(text[-1], False))
else:
new_tokens.extend([Token(char, False) for char in text])
else:
new_tokens.append(token)
return new_tokens


def merge_non_word_tokens(tokens: List[Token]) -> List[Token]:
"""Merge consecutive non-word units into a single token. Destroys tokens in the process.

>>> merge_non_word_tokens([Token("test", True), Token(":", False), Token(",", False)])
[Token(text='test', is_word=True), Token(text=':,', is_word=False)]
>>> merge_non_word_tokens([])
[]
"""
if not tokens:
return tokens
merged_tokens = [tokens[0]]
for token in tokens[1:]:
if not token.is_word and not merged_tokens[-1].is_word:
merged_tokens[-1].text += token.text
else:
results.append(current_item)
current_item = deepcopy(dct)
if current_item:
results.append(current_item)
return results
merged_tokens.append(token)
return merged_tokens


CATEGORIES = {
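Finally, a small usage sketch of how the new split and merge helpers cooperate (module paths as in this PR; the printed reprs follow the doctest output format shown above):

```python
from g2p.shared_types import Token
from g2p.mappings.utils import merge_non_word_tokens, split_non_word_tokens

# A long punctuation run is capped at five pieces by split_non_word_tokens
# (first two chars, middle block, last two chars), so the lexicon matcher
# never has to consider a quadratic number of prefixes.
pieces = split_non_word_tokens([Token("hello", True), Token("?!?!?!?", False)])
print(pieces)

# After lookup, merge_non_word_tokens collapses the leftover pieces back
# into a single non-word token.
print(merge_non_word_tokens(pieces))
# [Token(text='hello', is_word=True), Token(text='?!?!?!?', is_word=False)]
```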