Skip to content

Commit

Permalink
add rules for Latvian nouns (#154)
Browse files Browse the repository at this point in the history
* add rules for Latvian nouns

* code formatting

* better handling of Latvian

* fix tests
  • Loading branch information
adbar authored Nov 6, 2024
1 parent db5f5d8 commit 94931a8
Show file tree
Hide file tree
Showing 7 changed files with 55 additions and 3 deletions.
2 changes: 2 additions & 0 deletions simplemma/strategies/defaultrules/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from .de import apply_de
from .en import apply_en
from .fi import apply_fi
from .lv import apply_lv
from .nl import apply_nl
from .pl import apply_pl
from .ru import apply_ru
Expand All @@ -13,6 +14,7 @@
"de": apply_de,
"en": apply_en,
"fi": apply_fi,
"lv": apply_lv,
"nl": apply_nl,
"pl": apply_pl,
"ru": apply_ru,
Expand Down
45 changes: 45 additions & 0 deletions simplemma/strategies/defaultrules/lv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import re
from typing import Optional

from .generic import apply_rules

# https://en.wiktionary.org/wiki/Category:Latvian_suffixes

# Suffix-rewriting rules for Latvian: each key is a compiled pattern anchored
# at the end of the token, each value is the lemma suffix that replaces the
# matched ending.
# NOTE(review): rule order matters if apply_rules() returns the first match
# (specific suffixes first, generic fallbacks last) -- confirm against
# generic.apply_rules before reordering anything here.
DEFAULT_RULES = {
    # feminine nouns (dative plural is "-ām"; the macron-less "-am" is the
    # masculine dative singular and must not be captured here)
    re.compile(r"(?:āju|ājas|ājai|ājām|ājās)$"): "āja",
    re.compile(r"(?:ēju|ējas|ējai|ējām|ējās)$"): "ēja",
    re.compile(r"(?:ieci|ieces|iecei|iecē|ieču|iecēm|iecēs)$"): "iece",
    re.compile(r"(?:ieti|ietes|ietei|ietē|iešu|ietēm|ietēs)$"): "iete",
    re.compile(r"(?:iju|ijas|ijai|ijām)$"): "ija",
    re.compile(r"(?:ību|ības|ībai|ībām|ībās)$"): "ība",
    # adjectival suffixes (masculine and feminine forms map to the
    # masculine nominative singular)
    re.compile(r"(?:īgu|īga|īgam|īgi|īgus|īgiem|īgos|īgas|īgai|īgā|īgām|īgās)$"): "īgs",
    re.compile(r"(?:īva|īvu|īvam|īvas|īvai|īvus|īviem|īvos|īvā|īvām|īvās)$"): "īvs",
    # action nouns in -šana (feminine)
    re.compile(r"(?:šanu|šanas|šanai|šanā|šanām|šanās)$"): "šana",
    # nouns in -ums decline as masculine, so the plural endings are
    # -umi/-umus/-umiem/-umos rather than the feminine -umām/-umās
    re.compile(r"(?:umu|uma|umam|umā|umi|umus|umiem|umos)$"): "ums",  # |um
    # masculine nouns
    re.compile(r"(?:āju|āja|ājam|āj|āji|ājus|ājiem|ājos)$"): "ājs",
    re.compile(r"(?:iņu|iņa|iņam|iņ|iņi|iņus|iņiem|iņos)$"): "iņš",
    # adjectival suffix -isks
    re.compile(
        r"(?:isku|iska|iskam|iskā|iski|iskus|iskiem|isko|iskos|iskai|iskas|iskām|iskās)$"
    ): "isks",
    # "-iski" belongs to the -isks rule above; the plural of -isms is -ismi
    re.compile(r"(?:ismu|isma|ismam|ismā|ismi|ismus|ismiem|ismos|ism)$"): "isms",
    re.compile(r"(?:īti|īša|ītim|ītī|īt|īši|īšus|īšu|īšiem|īšos)$"): "ītis",
    re.compile(r"(?:kli|kļa|klim|klī|kļi|kļus|kļiem|kļos)$"): "klis",
    re.compile(r"(?:nieku|nieka|niekam|niekā|nieki|niekus|niekiem|niekos)$"): "nieks",
    re.compile(r"(?:ni|ņa|nim|nī|ņi|ņus|ņu|ņiem|ņos)$"): "nis",
    # fallback: generic case endings, kept last so that the specific
    # suffixes above take precedence
    re.compile(r"(?:as|ai|ā|ām|ās)$"): "a",
    re.compile(r"(?:ei|es|ē|ēm|ēs)$"): "e",
    re.compile(r"(?:is|im|ī|iem|īs)$"): "is",
    # disabled candidate rules:
    # re.compile(r"(?:os|us)$"): "s",
    # re.compile(r"(?:ēto|ēts)$"): "ēt",
}


def apply_lv(token: str) -> Optional[str]:
    """Apply the pre-defined Latvian suffix rules to a token.

    Tokens shorter than five characters are left alone and yield None;
    longer tokens are handed to apply_rules() together with DEFAULT_RULES
    and its result is returned unchanged.
    """
    if len(token) >= 5:
        return apply_rules(token, DEFAULT_RULES)
    return None
Binary file modified simplemma/strategies/dictionaries/data/lv.plzma
Binary file not shown.
2 changes: 1 addition & 1 deletion simplemma/strategies/greedy_dictionary_lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from .dictionaries.dictionary_factory import DefaultDictionaryFactory, DictionaryFactory
from .lemmatization_strategy import LemmatizationStrategy

SHORTER_GREEDY = {"bg", "et", "fi"}
SHORTER_GREEDY = {"bg", "et", "fi", "lv"}


class GreedyDictionaryLookupStrategy(LemmatizationStrategy):
Expand Down
3 changes: 3 additions & 0 deletions tests/strategies/defaultrules/test_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,6 @@ def test_DEFAULT_RULES() -> None:
assert rules_strategy.get_lemma("liikenaisessa", "fi") == "liikenainen"
assert rules_strategy.get_lemma("pracowaliście", "pl") == "pracować"
assert rules_strategy.get_lemma("безгра́мотностью", "ru") == "безгра́мотность"
assert rules_strategy.get_lemma("Rīga", "lv") is None
assert rules_strategy.get_lemma("šķirkļiem", "lv") == "šķirklis"
assert rules_strategy.get_lemma("mācībām", "lv") == "mācība"
1 change: 1 addition & 0 deletions tests/test_dictionary_pickler.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def test_logic() -> None:
# log warning
mydict = dictionary_pickler._read_dict(testfile, "zz", silent=False)
assert len(mydict) == 3

# different length
mydict = dictionary_pickler._read_dict(testfile, "en", silent=True)
assert len(mydict) == 5
Expand Down
5 changes: 3 additions & 2 deletions training/dictionary_pickler.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

LOGGER = logging.getLogger(__name__)

INPUT_PUNCT = re.compile(r"[,:*/\+_]|^-|-\t")
INPUT_PUNCT = re.compile(r"[,:*/\+_]|.+-$|.+-\t|^-.+")
SAFE_LIMIT = {
"cs",
"da",
Expand All @@ -31,6 +31,7 @@
"ga",
"hu",
"it",
"lv",
"pl",
"pt",
"ru",
Expand Down Expand Up @@ -86,7 +87,7 @@ def _read_dict(
and columns[1] != columns[0]
):
rule = DEFAULT_RULES[langcode](columns[1])
if rule is not None and rule != columns[1]:
if rule and rule != columns[0]:
print(columns[1], columns[0], rule)
# process
if columns[1] in mydict and mydict[columns[1]] != columns[0]:
Expand Down

0 comments on commit 94931a8

Please sign in to comment.