
Format
honnibal committed Sep 9, 2024
1 parent b65491b commit 59ac7e6
Showing 7 changed files with 68 additions and 18 deletions.
41 changes: 37 additions & 4 deletions spacy/lang/bo/lex_attrs.py
@@ -4,10 +4,43 @@

_num_words = [
"ཀླད་ཀོར་",
"གཅིག་", "གཉིས་", "གསུམ་", "བཞི་", "ལྔ་", "དྲུག་", "བདུན་", "བརྒྱད་", "དགུ་", "བཅུ་",
"བཅུ་གཅིག་", "བཅུ་གཉིས་", "བཅུ་གསུམ་", "བཅུ་བཞི་", "བཅུ་ལྔ་", "བཅུ་དྲུག་", "བཅུ་བདུན་", "བཅུ་པརྒྱད", "བཅུ་དགུ་", "ཉི་ཤུ་",
"སུམ་ཅུ", "བཞི་བཅུ", "ལྔ་བཅུ", "དྲུག་ཅུ", "བདུན་ཅུ", "བརྒྱད་ཅུ", "དགུ་བཅུ", "བརྒྱ་",
"སྟོང་", "ཁྲི་", "ས་ཡ་", " བྱེ་བ་", "དུང་ཕྱུར་", "ཐེར་འབུམ་", "ཐེར་འབུམ་ཆེན་པོ་", "ཁྲག་ཁྲིག་", "ཁྲག་ཁྲིག་ཆེན་པོ་",
"གཅིག་",
"གཉིས་",
"གསུམ་",
"བཞི་",
"ལྔ་",
"དྲུག་",
"བདུན་",
"བརྒྱད་",
"དགུ་",
"བཅུ་",
"བཅུ་གཅིག་",
"བཅུ་གཉིས་",
"བཅུ་གསུམ་",
"བཅུ་བཞི་",
"བཅུ་ལྔ་",
"བཅུ་དྲུག་",
"བཅུ་བདུན་",
"བཅུ་པརྒྱད",
"བཅུ་དགུ་",
"ཉི་ཤུ་",
"སུམ་ཅུ",
"བཞི་བཅུ",
"ལྔ་བཅུ",
"དྲུག་ཅུ",
"བདུན་ཅུ",
"བརྒྱད་ཅུ",
"དགུ་བཅུ",
"བརྒྱ་",
"སྟོང་",
"ཁྲི་",
"ས་ཡ་",
" བྱེ་བ་",
"དུང་ཕྱུར་",
"ཐེར་འབུམ་",
"ཐེར་འབུམ་ཆེན་པོ་",
"ཁྲག་ཁྲིག་",
"ཁྲག་ཁྲིག་ཆེན་པོ་",
]


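For context, spaCy language modules usually wire a list like _num_words into the LIKE_NUM lexical attribute. A minimal sketch of that common pattern follows (word list abbreviated; this is the typical layout, not the full contents of spacy/lang/bo/lex_attrs.py):

from spacy.attrs import LIKE_NUM

_num_words = ["ཀླད་ཀོར་", "གཅིག་", "གཉིས་"]  # abbreviated from the list above

def like_num(text):
    # Accept plain digits, simple fractions, and the spelled-out numerals.
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    return text in _num_words

LEX_ATTRS = {LIKE_NUM: like_num}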
4 changes: 3 additions & 1 deletion spacy/lang/gd/stop_words.py
@@ -382,5 +382,7 @@
ì
ò
ó
""".split("\n")
""".split(
"\n"
)
)
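The change above is purely mechanical: black splits the .split("\n") call over several lines, and the trailing lone ) closes the enclosing set(...) call. The reconstructed shape of the statement, presumably along these lines (entries abbreviated to those visible in the diff):

STOP_WORDS = set(
    """
ì
ò
ó
""".split(
        "\n"
    )
)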
23 changes: 12 additions & 11 deletions spacy/lang/gd/tokenizer_exceptions.py
@@ -18,19 +18,18 @@
"càil": [{ORTH: "cà", NORM: "càite"}, {ORTH: "il", NORM: "bheil"}],
"sna": [{ORTH: "s", NORM: "anns"}, {ORTH: "na", NORM: "na"}],
"orra": [{ORTH: "orr", NORM: "air"}, {ORTH: "a", NORM: "do"}],
"fiùs": [{ORTH: "fiù", NORM: "fiù"}, {ORTH: "s", NORM: "'s"}]
"fiùs": [{ORTH: "fiù", NORM: "fiù"}, {ORTH: "s", NORM: "'s"}],
}


# Hyphenations that are alternative forms of words
for exc_data in [
{ORTH: "fa-near",NORM: "fainear"},
{ORTH: "Fa-near",NORM: "Fainear"},
{ORTH: "fa-near", NORM: "fainear"},
{ORTH: "Fa-near", NORM: "Fainear"},
]:
_exc[exc_data[ORTH]] = [exc_data]





# Abbreviations and shortened words
for exc_data in [
{ORTH: "'", NORM: "a"},
@@ -1529,7 +1528,7 @@
Òige-sa
òrd-mhòr
Òrd-mhòr""".split():
- _exc[orth] = [{ORTH: orth}]
+ _exc[orth] = [{ORTH: orth}]

# Multiple words that should remain as one token
for orth in """'n diugh
@@ -1975,8 +1974,10 @@
tuilleadh 's a chòir
Tuilleadh 's a chòir
tuilleadh sa chòir
- Tuilleadh sa chòir""".split("\n"):
- _exc[orth] = [{ORTH: orth}]

+ Tuilleadh sa chòir""".split(
+ "\n"
+ ):
+ _exc[orth] = [{ORTH: orth}]


TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
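As a usage sketch of how these ORTH/NORM special cases behave at tokenization time (assuming a spaCy install that ships the gd language data):

import spacy

nlp = spacy.blank("gd")
doc = nlp("fiùs")
# The exception above splits the token and attaches normalised forms.
print([t.text for t in doc])   # ['fiù', 's']
print([t.norm_ for t in doc])  # ['fiù', "'s"]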
1 change: 1 addition & 0 deletions spacy/lang/kmr/__init__.py
@@ -12,4 +12,5 @@ class Kurmanji(Language):
lang = "kmr"
Defaults = KurmanjiDefaults

+
__all__ = ["Kurmanji"]
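A brief usage sketch (assuming this spaCy build registers the "kmr" language code):

import spacy

nlp = spacy.blank("kmr")  # constructs the Kurmanji class exported above
print(nlp.lang)  # 'kmr'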
2 changes: 2 additions & 0 deletions spacy/tests/conftest.py
@@ -80,10 +80,12 @@ def bg_tokenizer():
def bn_tokenizer():
return get_lang_class("bn")().tokenizer

+
@pytest.fixture(scope="session")
def bo_tokenizer():
return get_lang_class("bo")().tokenizer

+
@pytest.fixture(scope="session")
def ca_tokenizer():
return get_lang_class("ca")().tokenizer
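For context, tests consume these session-scoped fixtures simply by naming them as parameters; a minimal sketch (test name and sample text are illustrative, not from the repo):

def test_bo_tokenizer_returns_tokens(bo_tokenizer):
    # pytest injects the session-scoped fixture defined in conftest.py.
    tokens = bo_tokenizer("ཀླད་ཀོར་")
    assert len(tokens) > 0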
2 changes: 1 addition & 1 deletion spacy/tests/lang/bo/test_text.py
@@ -18,4 +18,4 @@
def test_lex_attrs_like_number(bo_tokenizer, text, match):
tokens = bo_tokenizer(text)
assert len(tokens) == 1
- assert tokens[0].like_num == match
+ assert tokens[0].like_num == match
13 changes: 12 additions & 1 deletion spacy/tests/lang/kmr/test_text.py
@@ -4,7 +4,18 @@


@pytest.mark.parametrize(
"word", ["yekem", "duyemîn", "100em", "dehem", "sedemîn", "34em", "30yem", "20emîn", "50yemîn"]
"word",
[
"yekem",
"duyemîn",
"100em",
"dehem",
"sedemîn",
"34em",
"30yem",
"20emîn",
"50yemîn",
],
)
def test_kmr_lex_attrs_like_number_for_ordinal(word):
assert like_num(word)
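A hedged sketch of an ordinal-aware check consistent with the parametrized cases above (the suffix and stem lists are illustrative assumptions, not the actual contents of spacy/lang/kmr/lex_attrs.py):

_num_words = ["yek", "du", "deh", "sed"]  # abbreviated, illustrative only

def like_num(word):
    if word.isdigit():
        return True
    # Check longer ordinal suffixes first so "yemîn" is not misread as "emîn".
    for suffix in ("yemîn", "emîn", "yem", "em"):
        if word.endswith(suffix):
            stem = word[: -len(suffix)]
            return stem.isdigit() or stem in _num_words
    return word in _num_words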
