forked from explosion/spaCy
-
Notifications
You must be signed in to change notification settings - Fork 0
/
test_cli.py
163 lines (145 loc) · 5.52 KB
/
test_cli.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# coding: utf-8
from __future__ import unicode_literals
import pytest
from spacy.lang.en import English
from spacy.cli.converters import conllu2json, iob2json, conll_ner2json
from spacy.cli.pretrain import make_docs
def test_cli_converters_conllu2json():
# https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu
lines = [
"1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tO",
"2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tB-PER",
"3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tI-PER",
"4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tO",
]
input_data = "\n".join(lines)
converted = conllu2json(input_data, n_sents=1)
assert len(converted) == 1
assert converted[0]["id"] == 0
assert len(converted[0]["paragraphs"]) == 1
assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
sent = converted[0]["paragraphs"][0]["sentences"][0]
assert len(sent["tokens"]) == 4
tokens = sent["tokens"]
assert [t["orth"] for t in tokens] == ["Dommer", "Finn", "Eilertsen", "avstår"]
assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB"]
assert [t["head"] for t in tokens] == [1, 2, -1, 0]
assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT"]
assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"]
def test_cli_converters_iob2json():
lines = [
"I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
"I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
"I|PRP|O like|VBP|O London|NNP|I-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O",
"I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O",
]
input_data = "\n".join(lines)
converted = iob2json(input_data, n_sents=10)
assert len(converted) == 1
assert converted[0]["id"] == 0
assert len(converted[0]["paragraphs"]) == 1
assert len(converted[0]["paragraphs"][0]["sentences"]) == 4
for i in range(0, 4):
sent = converted[0]["paragraphs"][0]["sentences"][i]
assert len(sent["tokens"]) == 8
tokens = sent["tokens"]
# fmt: off
assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]
assert [t["ner"] for t in tokens] == ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"]
# fmt: on
def test_cli_converters_conll_ner2json():
lines = [
"-DOCSTART- -X- O O",
"",
"I\tO",
"like\tO",
"London\tB-GPE",
"and\tO",
"New\tB-GPE",
"York\tI-GPE",
"City\tI-GPE",
".\tO",
"",
"I O",
"like O",
"London B-GPE",
"and O",
"New B-GPE",
"York I-GPE",
"City I-GPE",
". O",
"",
"I PRP O",
"like VBP O",
"London NNP B-GPE",
"and CC O",
"New NNP B-GPE",
"York NNP I-GPE",
"City NNP I-GPE",
". . O",
"",
"I PRP _ O",
"like VBP _ O",
"London NNP _ B-GPE",
"and CC _ O",
"New NNP _ B-GPE",
"York NNP _ I-GPE",
"City NNP _ I-GPE",
". . _ O",
"",
"I\tPRP\t_\tO",
"like\tVBP\t_\tO",
"London\tNNP\t_\tB-GPE",
"and\tCC\t_\tO",
"New\tNNP\t_\tB-GPE",
"York\tNNP\t_\tI-GPE",
"City\tNNP\t_\tI-GPE",
".\t.\t_\tO",
]
input_data = "\n".join(lines)
converted = conll_ner2json(input_data, n_sents=10)
print(converted)
assert len(converted) == 1
assert converted[0]["id"] == 0
assert len(converted[0]["paragraphs"]) == 1
assert len(converted[0]["paragraphs"][0]["sentences"]) == 5
for i in range(0, 5):
sent = converted[0]["paragraphs"][0]["sentences"][i]
assert len(sent["tokens"]) == 8
tokens = sent["tokens"]
# fmt: off
assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]
assert [t["ner"] for t in tokens] == ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"]
# fmt: on
def test_pretrain_make_docs():
nlp = English()
valid_jsonl_text = {"text": "Some text"}
docs, skip_count = make_docs(nlp, [valid_jsonl_text], 1, 10)
assert len(docs) == 1
assert skip_count == 0
valid_jsonl_tokens = {"tokens": ["Some", "tokens"]}
docs, skip_count = make_docs(nlp, [valid_jsonl_tokens], 1, 10)
assert len(docs) == 1
assert skip_count == 0
invalid_jsonl_type = 0
with pytest.raises(TypeError):
make_docs(nlp, [invalid_jsonl_type], 1, 100)
invalid_jsonl_key = {"invalid": "Does not matter"}
with pytest.raises(ValueError):
make_docs(nlp, [invalid_jsonl_key], 1, 100)
empty_jsonl_text = {"text": ""}
docs, skip_count = make_docs(nlp, [empty_jsonl_text], 1, 10)
assert len(docs) == 0
assert skip_count == 1
empty_jsonl_tokens = {"tokens": []}
docs, skip_count = make_docs(nlp, [empty_jsonl_tokens], 1, 10)
assert len(docs) == 0
assert skip_count == 1
too_short_jsonl = {"text": "This text is not long enough"}
docs, skip_count = make_docs(nlp, [too_short_jsonl], 10, 15)
assert len(docs) == 0
assert skip_count == 0
too_long_jsonl = {"text": "This text contains way too much tokens for this test"}
docs, skip_count = make_docs(nlp, [too_long_jsonl], 1, 5)
assert len(docs) == 0
assert skip_count == 0