Skip to content

Commit

Permalink
improve RU rules and UK language data (#3)
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar committed Jan 18, 2023
1 parent 24cb992 commit 68abbaa
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 9 deletions.
Binary file modified simplemma/data/ru.plzma
Binary file not shown.
Binary file modified simplemma/data/uk.plzma
Binary file not shown.
38 changes: 29 additions & 9 deletions simplemma/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -467,20 +467,35 @@ def apply_pl(token: str) -> Optional[str]:
return None


RUSSIAN_PREFIXES = {"за", "много", "недо", "пере", "пред", "само"}
# Set of Russian word prefixes, listed alphabetically.
# NOTE(review): the code consuming this set is not visible in this excerpt.
RUSSIAN_PREFIXES = {
    "гидро", "за", "контр", "микро",
    "много", "недо", "пере", "под",
    "пред", "при", "про", "радио",
    "раз", "рас", "само", "экстра",
    "электро",
}


RUSSIAN_ENDINGS = {
# -ость
"ости": "ость",
"остью": "ость",
"остию": "ость",
"остьи": "ость",
"остии": "ость",
"остьхъ": "ость",
"остьма": "ость",
"остьмъ": "ость",
"остиѭ": "ость",
"остьми": "ость",
"остей": "ость",
"остям": "ость",
"остями": "ость",
"остях": "ость",
# -ство
"ства": "ство",
"ств": "ство",
Expand All @@ -495,8 +510,13 @@ def apply_pl(token: str) -> Optional[str]:

def apply_ru(token: str) -> Optional[str]:
"Apply pre-defined rules for Russian."
if token.endswith("ё"):
return token.replace("ё", "е")
if len(token) < 10 or token[0].isupper() or "-" in token:
return None
# token = token.replace("а́", "a")
# token = token.replace("о́", "o")
# token = token.replace("и́", "и")
for ending, base in RUSSIAN_ENDINGS.items():
if token.endswith(ending):
return token[: -len(ending)] + base
Expand Down

0 comments on commit 68abbaa

Please sign in to comment.