From 24cb99284034b746bfea30470f296252f36b65a8 Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi
Date: Tue, 17 Jan 2023 18:22:46 +0100
Subject: [PATCH] adjust affix search, add rules and prefixes for DE & RU (#3)

---
 simplemma/rules.py      | 136 ++++++++++++++++++++++++++++++++++------
 simplemma/simplemma.py  |  43 ++++++++++---
 tests/test_rules.py     |   1 +
 tests/test_simplemma.py |  17 +++++
 4 files changed, 170 insertions(+), 27 deletions(-)

diff --git a/simplemma/rules.py b/simplemma/rules.py
index dcf4977..09d6c29 100644
--- a/simplemma/rules.py
+++ b/simplemma/rules.py
@@ -5,7 +5,30 @@
 from typing import Optional
 
 
-RULES_LANGS = {"de", "en", "fi", "nl"}
+RULES_LANGS = {"de", "en", "fi", "nl", "pl", "ru"}
+
+# VOWELS = {"a", "e", "i", "o", "u", "y"}
+
+
+def apply_rules(
+    token: str, langcode: Optional[str], greedy: bool = False
+) -> Optional[str]:
+    "Apply pre-defined rules for certain languages."
+    candidate = None
+    if langcode == "de":
+        candidate = apply_de(token, greedy)
+    elif langcode == "en":
+        candidate = apply_en(token)
+    elif langcode == "fi":
+        candidate = apply_fi(token)
+    elif langcode == "nl":
+        candidate = apply_nl(token)
+    elif langcode == "pl":
+        candidate = apply_pl(token)
+    elif langcode == "ru":
+        candidate = apply_ru(token)
+    return candidate
+
 
 NOUN_ENDINGS_DE = re.compile(
     r"(?:bold|[^kl]ling|ment)(e?[ns]?)?$|"
@@ -31,25 +54,62 @@
 ENDING_CHARS_ADJ_DE = ENDING_CHARS_NN_DE.union({"d", "t"})
 ENDING_DE = re.compile(r"(?:e|em|en|er|es)$")
 
-# VOWELS = {"a", "e", "i", "o", "u", "y"}
-
-
-def apply_rules(
-    token: str, langcode: Optional[str], greedy: bool = False
-) -> Optional[str]:
-    "Apply pre-defined rules for certain languages."
-    candidate = None
-    if langcode == "de":
-        candidate = apply_de(token, greedy)
-    elif langcode == "en":
-        candidate = apply_en(token)
-    elif langcode == "fi":
-        candidate = apply_fi(token)
-    elif langcode == "nl":
-        candidate = apply_nl(token)
-    elif langcode == "pl":
-        candidate = apply_pl(token)
-    return candidate
+# 2-letter prefixes are theoretically already accounted for by the current AFFIXLEN parameter
+GERMAN_PREFIXES = {
+    "ab",
+    "an",
+    "auf",
+    "aus",
+    "be",
+    "bei",
+    "da",
+    "dar",
+    "durch",
+    "ein",
+    "ent",
+    "er",
+    "gegen",
+    "her",
+    "heran",
+    "herab",
+    "herauf",
+    "heraus",
+    "herein",
+    "herum",
+    "herunter",
+    "hervor",
+    "hin",
+    "hinauf",
+    "hinaus",
+    "hinein",
+    "hinter",
+    "hinunter",
+    "hinweg",
+    "hinzu",
+    "los",
+    "miss",
+    "mit",
+    "nach",
+    "neben",
+    "ran",
+    "raus",
+    "rein",
+    "rum",
+    "runter",
+    "über",
+    "unter",
+    "ver",
+    "vor",
+    "voran",
+    "voraus",
+    "vorbei",
+    "vorher",
+    "vorüber",
+    "weg",
+    "weiter",
+    "wieder",
+    "zer",
+}
 
 
 def apply_de(token: str, greedy: bool = False) -> Optional[str]:
@@ -405,3 +465,39 @@ def apply_pl(token: str) -> Optional[str]:
         if token.endswith(ending):
             return token[: -len(ending)] + base
     return None
+
+
+RUSSIAN_PREFIXES = {"за", "много", "недо", "пере", "пред", "само"}
+
+RUSSIAN_ENDINGS = {
+    # -ость
+    "ости": "ость",
+    "остью": "ость",
+    "остию": "ость",
+    "остьи": "ость",
+    "остии": "ость",
+    "остьхъ": "ость",
+    "остьма": "ость",
+    "остьмъ": "ость",
+    "остиѭ": "ость",
+    "остьми": "ость",
+    # -ство
+    "ства": "ство",
+    "ств": "ство",
+    "ству": "ство",
+    "ствам": "ство",
+    "ством": "ство",
+    "ствами": "ство",
+    "стве": "ство",
+    "ствах": "ство",
+}
+
+
+def apply_ru(token: str) -> Optional[str]:
+    "Apply pre-defined rules for Russian."
+    if len(token) < 10 or token[0].isupper() or "-" in token:
+        return None
+    for ending, base in RUSSIAN_ENDINGS.items():
+        if token.endswith(ending):
+            return token[: -len(ending)] + base
+    return None
diff --git a/simplemma/simplemma.py b/simplemma/simplemma.py
index d329753..d080089 100644
--- a/simplemma/simplemma.py
+++ b/simplemma/simplemma.py
@@ -10,7 +10,7 @@
 from typing import Any, Dict, List, Iterator, Optional, Tuple, Union
 
 try:
-    from .rules import apply_rules, RULES_LANGS
+    from .rules import apply_rules, GERMAN_PREFIXES, RULES_LANGS, RUSSIAN_PREFIXES
     from .tokenizer import simple_tokenizer
 # local error, also ModuleNotFoundError for Python >= 3.6
 except ImportError:  # pragma: no cover
@@ -98,8 +98,11 @@
 }
 BETTER_LOWER = {"bg", "es", "hy", "lt", "lv", "pt", "sk"}
 BUFFER_HACK = {"bg", "es", "et", "fi", "fr", "it", "lt", "pl", "sk"}  # "da", "nl"
+
+# TODO: This custom behavior has to be simplified before it becomes unmaintainable
 LONGER_AFFIXES = {"et", "fi", "hu", "lt"}
 SHORTER_GREEDY = {"bg", "et", "fi"}
+AFFIX_LANGS = {"bg", "et", "fi", "hu", "lt", "lv", "nb", "pl", "ru", "sk", "tr"}
 
 HYPHEN_REGEX = re.compile(r"([_-])")
 HYPHENS = {"-", "_"}
@@ -174,7 +177,7 @@ def _read_dict(filepath: str, langcode: str, silent: bool) -> Dict[str, str]:
             if rule == columns[0]:
                 continue
             elif rule is not None and rule != columns[1]:
-                print(columns[1], columns[0], apply_rules(columns[1], langcode))
+                print(columns[1], columns[0], rule)
         # process
         if columns[1] in mydict and mydict[columns[1]] != columns[0]:
             # prevent mistakes and noise coming from the lists
@@ -417,6 +420,29 @@ def _affix_search(
     return candidate
 
 
+def _prefix_search(token: str, lang: str, datadict: Dict[str, str]) -> Optional[str]:
+    # load prefixes
+    if lang == "de":
+        preflist = GERMAN_PREFIXES
+    elif lang == "ru":
+        preflist = RUSSIAN_PREFIXES
+    else:
+        return None
+    # apply
+    prefix = None
+    for p in preflist:
+        if token.startswith(p):
+            prefix = p
+            break
+    # decompose according to predefined prefix
+    if prefix is not None:
+        subword = _simple_search(token[len(prefix) :], datadict)
+        if subword is not None:
+            if lang != "de" or token[len(prefix) : len(prefix) + 2] != "zu":
+                return prefix + subword.lower()
+    return None
+
+
 def _suffix_search(token: str, datadict: Dict[str, str]) -> Optional[str]:
     lastcount = 0
     for count in range(MINCOMPLEN, len(token) - MINCOMPLEN + 1):
@@ -451,13 +477,16 @@ def _return_lemma(
         if newcandidate is not None:
             candidate = newcandidate
     # stop here in some cases
-    if not greedy:
-        return candidate
+    # if not greedy:
+    #    return candidate
     limit = 6 if lang in SHORTER_GREEDY else 8
     if len(token) <= limit:
         return candidate
-    # greedy subword decomposition: suffix/affix search
+    # subword decomposition: predefined prefixes (absent from vocabulary if they are not words)
     if candidate is None:
+        candidate = _prefix_search(token, lang, datadict)  # type: ignore[arg-type]
+    # unsupervised suffix/affix search: not productive for all languages
+    if candidate is None and (greedy or lang in AFFIX_LANGS):
         # define parameters
         maxlen = LONGAFFIXLEN if lang in LONGER_AFFIXES else AFFIXLEN
         # greedier subword decomposition: suffix search with character in between
@@ -465,8 +494,8 @@
         candidate = _affix_search(token, datadict, maxlen) or _suffix_search(
             token, datadict
         )
-    # try further hops, not always a good idea
-    else:
+    # greedy mode: try further hops, not always a good idea
+    if candidate is not None and greedy:
         candidate = _greedy_search(candidate, datadict)
     return candidate
 
diff --git a/tests/test_rules.py b/tests/test_rules.py
index 2e3eb9c..ec6848e 100644
--- a/tests/test_rules.py
+++ b/tests/test_rules.py
@@ -161,3 +161,4 @@ def test_apply_rules():
     assert apply_rules("brieven", "nl") == "brief"
     assert apply_rules("liikenaisessa", "fi") == "liikenainen"
     assert apply_rules("pracowaliście", "pl") == "pracować"
+    assert apply_rules("безгра́мотностью", "ru") == "безгра́мотность"
diff --git a/tests/test_simplemma.py b/tests/test_simplemma.py
index 16cc01e..3f8f302 100644
--- a/tests/test_simplemma.py
+++ b/tests/test_simplemma.py
@@ -171,6 +171,17 @@ def test_logic():
         == "getestet"
     )
 
+    # prefixes
+    mydata = simplemma.simplemma._load_data(("de", "ru"))
+    assert (
+        simplemma.simplemma._prefix_search("zerlemmatisiertes", "de", mydata[0].dict)
+        == "zerlemmatisiert"
+    )
+    assert (
+        simplemma.simplemma._prefix_search("зафиксированные", "ru", mydata[1].dict)
+        == "зафиксированный"
+    )
+
 
 def test_convenience():
     """Test convenience functions."""
@@ -355,6 +366,12 @@ def test_subwords():
     # assert lemmatize('Spargelstangen', lang='de', greedy=True) == 'Spargelstange'
     # assert lemmatize("Bandmitgliedern", lang="de", greedy=True) == "Bandmitglied"
 
+    # prefixes
+    assert lemmatize("lemmatisiertes", lang="de") == "lemmatisiert"
+    assert lemmatize("zerlemmatisiertes", lang="de") == "zerlemmatisiert"
+    assert lemmatize("фиксированные", lang="ru") == "фиксированный"
+    assert lemmatize("зафиксированные", lang="ru") == "зафиксированный"
+
 
 def test_tokenizer():
     # tokenization and chaining