adjust affix search, add rules and prefixes for DE & RU (#3)
adbar committed Jan 17, 2023
1 parent 369f075 commit 24cb992
Showing 4 changed files with 170 additions and 27 deletions.
136 changes: 116 additions & 20 deletions simplemma/rules.py
@@ -5,7 +5,30 @@
from typing import Optional


RULES_LANGS = {"de", "en", "fi", "nl"}
RULES_LANGS = {"de", "en", "fi", "nl", "pl", "ru"}

# VOWELS = {"a", "e", "i", "o", "u", "y"}


def apply_rules(
token: str, langcode: Optional[str], greedy: bool = False
) -> Optional[str]:
"Apply pre-defined rules for certain languages."
candidate = None
if langcode == "de":
candidate = apply_de(token, greedy)
elif langcode == "en":
candidate = apply_en(token)
elif langcode == "fi":
candidate = apply_fi(token)
elif langcode == "nl":
candidate = apply_nl(token)
elif langcode == "pl":
candidate = apply_pl(token)
elif langcode == "ru":
candidate = apply_ru(token)
return candidate
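
For orientation, a minimal usage sketch of the dispatcher above, assuming the per-language helpers defined further down in this module are in place; the expected values are taken from tests/test_rules.py below:

# sketch only: exercising apply_rules for supported and unsupported language codes
from simplemma.rules import apply_rules

assert apply_rules("pracowaliście", "pl") == "pracować"
assert apply_rules("безгра́мотностью", "ru") == "безгра́мотность"
assert apply_rules("token", "xx") is None  # unsupported code: no rule set is applied
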


NOUN_ENDINGS_DE = re.compile(
r"(?:bold|[^kl]ling|ment)(e?[ns]?)?$|"
@@ -31,25 +54,62 @@
ENDING_CHARS_ADJ_DE = ENDING_CHARS_NN_DE.union({"d", "t"})
ENDING_DE = re.compile(r"(?:e|em|en|er|es)$")

# VOWELS = {"a", "e", "i", "o", "u", "y"}


def apply_rules(
token: str, langcode: Optional[str], greedy: bool = False
) -> Optional[str]:
"Apply pre-defined rules for certain languages."
candidate = None
if langcode == "de":
candidate = apply_de(token, greedy)
elif langcode == "en":
candidate = apply_en(token)
elif langcode == "fi":
candidate = apply_fi(token)
elif langcode == "nl":
candidate = apply_nl(token)
elif langcode == "pl":
candidate = apply_pl(token)
return candidate
# 2-letter prefixes are theoretically already accounted for by the current AFFIXLEN parameter
GERMAN_PREFIXES = {
"ab",
"an",
"auf",
"aus",
"be",
"bei",
"da",
"dar",
"durch",
"ein",
"ent",
"er",
"gegen",
"her",
"heran",
"herab",
"herauf",
"heraus",
"herein",
"herum",
"herunter",
"hervor",
"hin",
"hinauf",
"hinaus",
"hinein",
"hinter",
"hinunter",
"hinweg",
"hinzu",
"los",
"miss",
"mit",
"nach",
"neben",
"ran",
"raus",
"rein",
"rum",
"runter",
"über",
"unter",
"ver",
"vor",
"voran",
"voraus",
"vorbei",
"vorher",
"vorüber",
"weg",
"weiter",
"wieder",
"zer",
}


def apply_de(token: str, greedy: bool = False) -> Optional[str]:
@@ -405,3 +465,39 @@ def apply_pl(token: str) -> Optional[str]:
if token.endswith(ending):
return token[: -len(ending)] + base
return None


RUSSIAN_PREFIXES = {"за", "много", "недо", "пере", "пред", "само"}

RUSSIAN_ENDINGS = {
# -ость
"ости": "ость",
"остью": "ость",
"остию": "ость",
"остьи": "ость",
"остии": "ость",
"остьхъ": "ость",
"остьма": "ость",
"остьмъ": "ость",
"остиѭ": "ость",
"остьми": "ость",
# -ство
"ства": "ство",
"ств": "ство",
"ству": "ство",
"ствам": "ство",
"ством": "ство",
"ствами": "ство",
"стве": "ство",
"ствах": "ство",
}


def apply_ru(token: str) -> Optional[str]:
"Apply pre-defined rules for Russian."
if len(token) < 10 or token[0].isupper() or "-" in token:
return None
for ending, base in RUSSIAN_ENDINGS.items():
if token.endswith(ending):
return token[: -len(ending)] + base
return None
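
As a worked illustration of apply_ru (the first example value is the one used in tests/test_rules.py below): the token is at least 10 characters long, starts lowercase, contains no hyphen, and ends in "остью", so that ending is swapped for the base form "ость".

# sketch only
apply_ru("безгра́мотностью")  # "безгра́мотностью"[:-len("остью")] + "ость" -> "безгра́мотность"
apply_ru("фиксированные")    # None: no ending from RUSSIAN_ENDINGS matches
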
43 changes: 36 additions & 7 deletions simplemma/simplemma.py
@@ -10,7 +10,7 @@
from typing import Any, Dict, List, Iterator, Optional, Tuple, Union

try:
from .rules import apply_rules, RULES_LANGS
from .rules import apply_rules, GERMAN_PREFIXES, RULES_LANGS, RUSSIAN_PREFIXES
from .tokenizer import simple_tokenizer
# local error, also ModuleNotFoundError for Python >= 3.6
except ImportError: # pragma: no cover
@@ -98,8 +98,11 @@
}
BETTER_LOWER = {"bg", "es", "hy", "lt", "lv", "pt", "sk"}
BUFFER_HACK = {"bg", "es", "et", "fi", "fr", "it", "lt", "pl", "sk"} # "da", "nl"

# TODO: This custom behavior has to be simplified before it becomes unmaintainable
LONGER_AFFIXES = {"et", "fi", "hu", "lt"}
SHORTER_GREEDY = {"bg", "et", "fi"}
AFFIX_LANGS = {"bg", "et", "fi", "hu", "lt", "lv", "nb", "pl", "ru", "sk", "tr"}

HYPHEN_REGEX = re.compile(r"([_-])")
HYPHENS = {"-", "_"}
@@ -174,7 +177,7 @@ def _read_dict(filepath: str, langcode: str, silent: bool) -> Dict[str, str]:
if rule == columns[0]:
continue
elif rule is not None and rule != columns[1]:
print(columns[1], columns[0], apply_rules(columns[1], langcode))
print(columns[1], columns[0], rule)
# process
if columns[1] in mydict and mydict[columns[1]] != columns[0]:
# prevent mistakes and noise coming from the lists
@@ -417,6 +420,29 @@ def _affix_search(
return candidate


def _prefix_search(token: str, lang: str, datadict: Dict[str, str]) -> Optional[str]:
# load prefixes
if lang == "de":
preflist = GERMAN_PREFIXES
elif lang == "ru":
preflist = RUSSIAN_PREFIXES
else:
return None
# apply
prefix = None
for p in preflist:
if token.startswith(p):
prefix = p
break
# decompose according to predefined prefix
if prefix is not None:
subword = _simple_search(token[len(prefix) :], datadict)
if subword is not None:
if lang != "de" or token[len(prefix) : len(prefix) + 2] != "zu":
return prefix + subword.lower()
return None
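
A brief usage sketch of _prefix_search, mirroring the new cases added to tests/test_simplemma.py below; the data loading follows those tests, and "abzuwarten" is merely an illustrative token for the "zu" guard:

# sketch only
mydata = _load_data(("de", "ru"))
_prefix_search("zerlemmatisiertes", "de", mydata[0].dict)  # "zer" + lemma("lemmatisiertes") -> "zerlemmatisiert"
_prefix_search("зафиксированные", "ru", mydata[1].dict)    # "за" + lemma("фиксированные") -> "зафиксированный"
_prefix_search("abzuwarten", "de", mydata[0].dict)         # None: German forms with "zu" right after the prefix are excluded
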


def _suffix_search(token: str, datadict: Dict[str, str]) -> Optional[str]:
lastcount = 0
for count in range(MINCOMPLEN, len(token) - MINCOMPLEN + 1):
@@ -451,22 +477,25 @@ def _return_lemma(
if newcandidate is not None:
candidate = newcandidate
# stop here in some cases
if not greedy:
return candidate
# if not greedy:
# return candidate
limit = 6 if lang in SHORTER_GREEDY else 8
if len(token) <= limit:
return candidate
# greedy subword decomposition: suffix/affix search
# subword decomposition: predefined prefixes (absent from vocabulary if they are not words)
if candidate is None:
candidate = _prefix_search(token, lang, datadict) # type: ignore[arg-type]
# unsupervised suffix/affix search: not productive for all languages
if candidate is None and (greedy or lang in AFFIX_LANGS):
# define parameters
maxlen = LONGAFFIXLEN if lang in LONGER_AFFIXES else AFFIXLEN
# greedier subword decomposition: suffix search with character in between
# then suffixes
candidate = _affix_search(token, datadict, maxlen) or _suffix_search(
token, datadict
)
# try further hops, not always a good idea
else:
# greedy mode: try further hops, not always a good idea
if candidate is not None and greedy:
candidate = _greedy_search(candidate, datadict)
return candidate

1 change: 1 addition & 0 deletions tests/test_rules.py
@@ -161,3 +161,4 @@ def test_apply_rules():
assert apply_rules("brieven", "nl") == "brief"
assert apply_rules("liikenaisessa", "fi") == "liikenainen"
assert apply_rules("pracowaliście", "pl") == "pracować"
assert apply_rules("безгра́мотностью", "ru") == "безгра́мотность"
17 changes: 17 additions & 0 deletions tests/test_simplemma.py
@@ -171,6 +171,17 @@ def test_logic():
== "getestet"
)

# prefixes
mydata = simplemma.simplemma._load_data(("de", "ru"))
assert (
simplemma.simplemma._prefix_search("zerlemmatisiertes", "de", mydata[0].dict)
== "zerlemmatisiert"
)
assert (
simplemma.simplemma._prefix_search("зафиксированные", "ru", mydata[1].dict)
== "зафиксированный"
)


def test_convenience():
"""Test convenience functions."""
@@ -355,6 +366,12 @@ def test_subwords():
# assert lemmatize('Spargelstangen', lang='de', greedy=True) == 'Spargelstange'
# assert lemmatize("Bandmitgliedern", lang="de", greedy=True) == "Bandmitglied"

# prefixes
assert lemmatize("lemmatisiertes", lang="de") == "lemmatisiert"
assert lemmatize("zerlemmatisiertes", lang="de") == "zerlemmatisiert"
assert lemmatize("фиксированные", lang="ru") == "фиксированный"
assert lemmatize("зафиксированные", lang="ru") == "зафиксированный"


def test_tokenizer():
# tokenization and chaining
