From 24cb99284034b746bfea30470f296252f36b65a8 Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi
Date: Tue, 17 Jan 2023 18:22:46 +0100
Subject: [PATCH] adjust affix search, add rules and prefixes for DE & RU (#3)

---
 simplemma/rules.py      | 136 ++++++++++++++++++++++++++++++++++------
 simplemma/simplemma.py  |  43 ++++++++++---
 tests/test_rules.py     |   1 +
 tests/test_simplemma.py |  17 +++++
 4 files changed, 170 insertions(+), 27 deletions(-)

diff --git a/simplemma/rules.py b/simplemma/rules.py
index dcf4977..09d6c29 100644
--- a/simplemma/rules.py
+++ b/simplemma/rules.py
@@ -5,7 +5,30 @@
 from typing import Optional
 
 
-RULES_LANGS = {"de", "en", "fi", "nl"}
+RULES_LANGS = {"de", "en", "fi", "nl", "pl", "ru"}
+
+# VOWELS = {"a", "e", "i", "o", "u", "y"}
+
+
+def apply_rules(
+    token: str, langcode: Optional[str], greedy: bool = False
+) -> Optional[str]:
+    "Apply pre-defined rules for certain languages."
+    candidate = None
+    if langcode == "de":
+        candidate = apply_de(token, greedy)
+    elif langcode == "en":
+        candidate = apply_en(token)
+    elif langcode == "fi":
+        candidate = apply_fi(token)
+    elif langcode == "nl":
+        candidate = apply_nl(token)
+    elif langcode == "pl":
+        candidate = apply_pl(token)
+    elif langcode == "ru":
+        candidate = apply_ru(token)
+    return candidate
+
 
 NOUN_ENDINGS_DE = re.compile(
     r"(?:bold|[^kl]ling|ment)(e?[ns]?)?$|"
@@ -31,25 +54,62 @@
 ENDING_CHARS_ADJ_DE = ENDING_CHARS_NN_DE.union({"d", "t"})
 ENDING_DE = re.compile(r"(?:e|em|en|er|es)$")
 
-# VOWELS = {"a", "e", "i", "o", "u", "y"}
-
-
-def apply_rules(
-    token: str, langcode: Optional[str], greedy: bool = False
-) -> Optional[str]:
-    "Apply pre-defined rules for certain languages."
-    candidate = None
-    if langcode == "de":
-        candidate = apply_de(token, greedy)
-    elif langcode == "en":
-        candidate = apply_en(token)
-    elif langcode == "fi":
-        candidate = apply_fi(token)
-    elif langcode == "nl":
-        candidate = apply_nl(token)
-    elif langcode == "pl":
-        candidate = apply_pl(token)
-    return candidate
+# 2-letter prefixes are theoretically already accounted for by the current AFFIXLEN parameter
+GERMAN_PREFIXES = {
+    "ab",
+    "an",
+    "auf",
+    "aus",
+    "be",
+    "bei",
+    "da",
+    "dar",
+    "durch",
+    "ein",
+    "ent",
+    "er",
+    "gegen",
+    "her",
+    "heran",
+    "herab",
+    "herauf",
+    "heraus",
+    "herein",
+    "herum",
+    "herunter",
+    "hervor",
+    "hin",
+    "hinauf",
+    "hinaus",
+    "hinein",
+    "hinter",
+    "hinunter",
+    "hinweg",
+    "hinzu",
+    "los",
+    "miss",
+    "mit",
+    "nach",
+    "neben",
+    "ran",
+    "raus",
+    "rein",
+    "rum",
+    "runter",
+    "über",
+    "unter",
+    "ver",
+    "vor",
+    "voran",
+    "voraus",
+    "vorbei",
+    "vorher",
+    "vorüber",
+    "weg",
+    "weiter",
+    "wieder",
+    "zer",
+}
 
 
 def apply_de(token: str, greedy: bool = False) -> Optional[str]:
@@ -405,3 +465,39 @@ def apply_pl(token: str) -> Optional[str]:
         if token.endswith(ending):
             return token[: -len(ending)] + base
     return None
+
+
+RUSSIAN_PREFIXES = {"за", "много", "недо", "пере", "пред", "само"}
+
+RUSSIAN_ENDINGS = {
+    # -ость
+    "ости": "ость",
+    "остью": "ость",
+    "остию": "ость",
+    "остьи": "ость",
+    "остии": "ость",
+    "остьхъ": "ость",
+    "остьма": "ость",
+    "остьмъ": "ость",
+    "остиѭ": "ость",
+    "остьми": "ость",
+    # -ство
+    "ства": "ство",
+    "ств": "ство",
+    "ству": "ство",
+    "ствам": "ство",
+    "ством": "ство",
+    "ствами": "ство",
+    "стве": "ство",
+    "ствах": "ство",
+}
+
+
+def apply_ru(token: str) -> Optional[str]:
+    "Apply pre-defined rules for Russian."
+    if len(token) < 10 or token[0].isupper() or "-" in token:
+        return None
+    for ending, base in RUSSIAN_ENDINGS.items():
+        if token.endswith(ending):
+            return token[: -len(ending)] + base
+    return None
diff --git a/simplemma/simplemma.py b/simplemma/simplemma.py
index d329753..d080089 100644
--- a/simplemma/simplemma.py
+++ b/simplemma/simplemma.py
@@ -10,7 +10,7 @@
 from typing import Any, Dict, List, Iterator, Optional, Tuple, Union
 
 try:
-    from .rules import apply_rules, RULES_LANGS
+    from .rules import apply_rules, GERMAN_PREFIXES, RULES_LANGS, RUSSIAN_PREFIXES
     from .tokenizer import simple_tokenizer
 # local error, also ModuleNotFoundError for Python >= 3.6
 except ImportError:  # pragma: no cover
@@ -98,8 +98,11 @@
 }
 BETTER_LOWER = {"bg", "es", "hy", "lt", "lv", "pt", "sk"}
 BUFFER_HACK = {"bg", "es", "et", "fi", "fr", "it", "lt", "pl", "sk"}  # "da", "nl"
+
+# TODO: This custom behavior has to be simplified before it becomes unmaintainable
 LONGER_AFFIXES = {"et", "fi", "hu", "lt"}
 SHORTER_GREEDY = {"bg", "et", "fi"}
+AFFIX_LANGS = {"bg", "et", "fi", "hu", "lt", "lv", "nb", "pl", "ru", "sk", "tr"}
 
 HYPHEN_REGEX = re.compile(r"([_-])")
 HYPHENS = {"-", "_"}
@@ -174,7 +177,7 @@ def _read_dict(filepath: str, langcode: str, silent: bool) -> Dict[str, str]:
             if rule == columns[0]:
                 continue
             elif rule is not None and rule != columns[1]:
-                print(columns[1], columns[0], apply_rules(columns[1], langcode))
+                print(columns[1], columns[0], rule)
         # process
         if columns[1] in mydict and mydict[columns[1]] != columns[0]:
             # prevent mistakes and noise coming from the lists
@@ -417,6 +420,29 @@ def _affix_search(
     return candidate
 
 
+def _prefix_search(token: str, lang: str, datadict: Dict[str, str]) -> Optional[str]:
+    # load prefixes
+    if lang == "de":
+        preflist = GERMAN_PREFIXES
+    elif lang == "ru":
+        preflist = RUSSIAN_PREFIXES
+    else:
+        return None
+    # apply
+    prefix = None
+    for p in preflist:
+        if token.startswith(p):
+            prefix = p
+            break
+    # decompose according to predefined prefix
+    if prefix is not None:
+        subword = _simple_search(token[len(prefix) :], datadict)
+        if subword is not None:
+            if lang != "de" or token[len(prefix) : len(prefix) + 2] != "zu":
+                return prefix + subword.lower()
+    return None
+
+
 def _suffix_search(token: str, datadict: Dict[str, str]) -> Optional[str]:
     lastcount = 0
     for count in range(MINCOMPLEN, len(token) - MINCOMPLEN + 1):
@@ -451,13 +477,16 @@ def _return_lemma(
         if newcandidate is not None:
             candidate = newcandidate
     # stop here in some cases
-    if not greedy:
-        return candidate
+    # if not greedy:
+    #    return candidate
     limit = 6 if lang in SHORTER_GREEDY else 8
     if len(token) <= limit:
         return candidate
-    # greedy subword decomposition: suffix/affix search
+    # subword decomposition: predefined prefixes (absent from vocabulary if they are not words)
     if candidate is None:
+        candidate = _prefix_search(token, lang, datadict)  # type: ignore[arg-type]
+    # unsupervised suffix/affix search: not productive for all languages
+    if candidate is None and (greedy or lang in AFFIX_LANGS):
         # define parameters
         maxlen = LONGAFFIXLEN if lang in LONGER_AFFIXES else AFFIXLEN
         # greedier subword decomposition: suffix search with character in between
@@ -465,8 +494,8 @@
         candidate = _affix_search(token, datadict, maxlen) or _suffix_search(
             token, datadict
         )
-    # try further hops, not always a good idea
-    else:
+    # greedy mode: try further hops, not always a good idea
+    if candidate is not None and greedy:
         candidate = _greedy_search(candidate, datadict)
     return candidate
 
diff --git a/tests/test_rules.py b/tests/test_rules.py
index 2e3eb9c..ec6848e 100644
--- a/tests/test_rules.py
+++ b/tests/test_rules.py
@@ -161,3 +161,4 @@ def test_apply_rules():
     assert apply_rules("brieven", "nl") == "brief"
     assert apply_rules("liikenaisessa", "fi") == "liikenainen"
     assert apply_rules("pracowaliście", "pl") == "pracować"
+    assert apply_rules("безгра́мотностью", "ru") == "безгра́мотность"
diff --git a/tests/test_simplemma.py b/tests/test_simplemma.py
index 16cc01e..3f8f302 100644
--- a/tests/test_simplemma.py
+++ b/tests/test_simplemma.py
@@ -171,6 +171,17 @@ def test_logic():
         == "getestet"
     )
 
+    # prefixes
+    mydata = simplemma.simplemma._load_data(("de", "ru"))
+    assert (
+        simplemma.simplemma._prefix_search("zerlemmatisiertes", "de", mydata[0].dict)
+        == "zerlemmatisiert"
+    )
+    assert (
+        simplemma.simplemma._prefix_search("зафиксированные", "ru", mydata[1].dict)
+        == "зафиксированный"
+    )
+
 
 def test_convenience():
     """Test convenience functions."""
@@ -355,6 +366,12 @@ def test_subwords():
     # assert lemmatize('Spargelstangen', lang='de', greedy=True) == 'Spargelstange'
     # assert lemmatize("Bandmitgliedern", lang="de", greedy=True) == "Bandmitglied"
 
+    # prefixes
+    assert lemmatize("lemmatisiertes", lang="de") == "lemmatisiert"
+    assert lemmatize("zerlemmatisiertes", lang="de") == "zerlemmatisiert"
+    assert lemmatize("фиксированные", lang="ru") == "фиксированный"
+    assert lemmatize("зафиксированные", lang="ru") == "зафиксированный"
+
 
 def test_tokenizer():
     # tokenization and chaining