-
Notifications
You must be signed in to change notification settings - Fork 0
/
tests.py
138 lines (103 loc) · 3.89 KB
/
tests.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import pytest
from hypothesis import strategies as st, given, settings
from finntk.omor.extract import (
extract_lemmas,
extract_lemmas_combs,
extract_lemmas_recurs,
extract_lemmas_span,
)
from finntk.wordnet.reader import fiwn
from scipy.spatial.distance import cosine
import heapq
import itertools
def intersect(*its):
    """Lazily yield the values that occur in every one of the given iterables.

    Each iterable must already be sorted in ascending order, because the
    inputs are combined with ``heapq.merge``.  Common values are yielded in
    that same order, once each.

    Repeated values inside a single iterable are collapsed before counting,
    so a duplicate in one input can neither stand in for a value missing
    from another input nor hide a value that really is common to all.

    :param its: sorted iterables of mutually comparable values
    :return: generator over the common values; it yields nothing when
        called without any iterables
    """
    # groupby collapses runs of equal values; in a sorted input that removes
    # every duplicate while keeping the stream lazy.
    deduped = [(key for key, _ in itertools.groupby(it)) for it in its]
    for key, values in itertools.groupby(heapq.merge(*deduped)):
        # After de-duplication each input contributes a value at most once,
        # so a run as long as the number of inputs means all of them had it.
        if sum(1 for _ in values) == len(its):
            yield key
@pytest.mark.parametrize(
    ("compound", "expected_lemmas"),
    [
        pytest.param(
            "merenranta",
            {"meri", "ranta", "merenranta"},
            id="merenranta",
        ),
        pytest.param(
            "koneapulainen",
            {"kone", "apulainen", "koneapulainen"},
            id="koneapulainen",
        ),
        pytest.param(
            "voileipäkakku",
            {"voi", "voileipä", "voileipäkakku"},
            id="voileipäkakku",
        ),
        pytest.param(
            "naissukupuoli",
            {"nainen", "suku", "puoli", "sukupuoli", "naissukupuoli"},
            id="naissukupuoli",
        ),
    ],
)
def test_lemmas_combs(compound, expected_lemmas):
    """Check that ``extract_lemmas_combs`` returns at least the expected lemmas.

    Every case pairs a compound word with lemmas that must show up in the
    result: the compound itself together with component lemmas.  The check
    is a superset test, so additional lemmas in the result are accepted.
    """
    assert extract_lemmas_combs(compound).issuperset(expected_lemmas)
@pytest.mark.parametrize(
    ("compound", "expected_lemmas"),
    [
        pytest.param(
            "synnyinkaupunkini",
            {"synnyin", "synty", "syntyä"},
            id="synnyinkaupunkini",
        ),
    ],
)
def test_lemmas_recurs(compound, expected_lemmas):
    """Check that ``extract_lemmas_recurs`` returns at least the expected lemmas.

    The check is a superset test, so the result may contain further lemmas
    beyond the ones listed for the case.
    """
    assert extract_lemmas_recurs(compound).issuperset(expected_lemmas)
@pytest.mark.parametrize(
    ("form", "expected"),
    [
        pytest.param("en", "ei", id="en"),
        pytest.param("pyykinpesuun", "pyykinpesu", id="pyykinpesuun"),
    ],
)
def test_extract_lemmas_span(form, expected):
    """Check that ``extract_lemmas_span`` yields exactly one lemma per form.

    The result must equal the one-element set holding the expected lemma.
    """
    span_lemmas = extract_lemmas_span(form)
    assert span_lemmas == {expected}
@pytest.mark.parametrize("brace", ["[", "]"])
def test_braces_roundtrip(brace):
    """Check that a lone square bracket comes back as its own only lemma."""
    lemmas = extract_lemmas(brace)
    assert lemmas == {brace}
def fiwn_conceptnet_common_lemmas():
    """Return the names found both in ``fiwn`` and among the ``/c/fi/`` vectors.

    The Numberbatch vectors are fetched when this function is called; the
    result is the iterator produced by ``intersect`` over
    ``fiwn.all_lemma_names()`` and the ``/c/fi/`` entity names with that
    prefix stripped.

    NOTE(review): ``intersect`` merges its inputs, so both name streams are
    presumably sorted already -- confirm.
    """
    from finntk.emb.numberbatch import vecs as numberbatch_vecs

    prefix = "/c/fi/"
    embeddings = numberbatch_vecs.get_vecs()

    def conceptnet_finnish_names():
        # Runs lazily: the entity list is only read once iteration starts.
        skip = len(prefix)
        for name in embeddings.index2entity:
            if not name.startswith(prefix):
                continue
            yield name[skip:]

    wordnet_names = fiwn.all_lemma_names()
    return intersect(wordnet_names, conceptnet_finnish_names())
# The first 300 common lemma names (fewer if the source runs out), computed
# once at import time and used as the sample pool for the Hypothesis tests.
fiwn_conceptnet_common_lemmas_300 = list(
    itertools.islice(fiwn_conceptnet_common_lemmas(), 300)
)
@settings(deadline=None)
@given(st.sampled_from(fiwn_conceptnet_common_lemmas_300))
def test_get_lemma_vec(lemma_name):
    """Check that ``mk_lemma_vec`` gives a vector for every lemma of a name.

    The name is sampled from the precomputed pool of common lemma names, and
    each lemma object ``fiwn`` returns for it must map to a non-``None``
    result.
    """
    from finntk.emb.autoextend import mk_lemma_vec

    # map is lazy, so each vector is built right before it is checked.
    for vec in map(mk_lemma_vec, fiwn.lemmas(lemma_name)):
        assert vec is not None
@settings(deadline=None)
@given(st.sampled_from(fiwn_conceptnet_common_lemmas_300))
def test_get_synset_vec(lemma_name):
    """Check that ``mk_synset_vec`` gives a vector for a lemma's synset.

    The name is sampled from the precomputed pool of common lemma names; the
    synset of the first lemma ``fiwn`` returns for it must map to a
    non-``None`` result.

    The Hypothesis deadline is disabled, matching ``test_get_lemma_vec``:
    Hypothesis otherwise enforces a 200 ms per-example deadline, and the
    embedding lookup is presumably slow on first use (TODO confirm), which
    would make the test fail on timing rather than on correctness.
    """
    from finntk.emb.autoextend import mk_synset_vec

    synset = fiwn.lemmas(lemma_name)[0].synset()
    assert mk_synset_vec(synset) is not None
@settings(deadline=None)
@given(st.sampled_from(["pitää", "saada"]))
def test_surf_vec_matches(surf):
    """Check that the autoextend vector of a word matches its concept vector.

    For each sampled surface form, the cosine distance between
    ``mk_concept_vec("fi", surf)`` and the autoextend vector stored under
    the same key must be below 0.01.

    The Hypothesis deadline is disabled, matching ``test_get_lemma_vec``:
    Hypothesis otherwise enforces a 200 ms per-example deadline, and
    fetching the vectors is presumably slow on first use (TODO confirm),
    which would make the test fail on timing rather than on correctness.
    """
    from finntk.emb.autoextend import vecs as autoextend_vecs
    from finntk.emb.numberbatch import mk_concept_vec

    concept_vec = mk_concept_vec("fi", surf)
    surface_vec = autoextend_vecs.get_vecs()[surf]
    assert cosine(concept_vec, surface_vec) < 0.01
def test_no_extra_lemmas():
    """Check that ``"vara_2rengas"`` is not among the lemmas of "vararengas"."""
    assert "vara_2rengas" not in extract_lemmas_span("vararengas")
def test_extract_true_lemmas_span_no_clobber_minen():
    """Check that "tuleminen" keeps its own lemma and does not yield "tulla"."""
    from finntk.omor.extract import extract_true_lemmas_span

    lemmas = extract_true_lemmas_span("tuleminen")
    # Order of the two checks is kept: absence of the verb first, then
    # presence of the form itself.
    assert "tulla" not in lemmas
    assert "tuleminen" in lemmas
def test_extract_true_lemmas_span_mista():
    """Check the lemma and case features extracted for "mistä".

    The result must contain the key "mikä", its value must hold at least
    one entry, and every entry must contain the pair ``("case", "ELA")``.
    """
    from finntk.omor.extract import extract_true_lemmas_span

    lemma_to_feats = extract_true_lemmas_span("mistä")
    assert "mikä" in lemma_to_feats

    analyses = lemma_to_feats["mikä"]
    assert len(analyses) > 0
    for analysis in analyses:
        assert ("case", "ELA") in analysis