From 77488414550ca67eca7070cff8e891d2061c8237 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Mon, 31 Oct 2022 19:44:54 +0900
Subject: [PATCH 1/4] Fix handling of small docs in coref

Docs with one or zero tokens fail in the coref component. This doesn't
have a fix yet, just a failing test. (There is also a test for the span
resolver, which does not fail.)
---
 spacy_experimental/coref/tests/test_coref.py         | 5 +++--
 spacy_experimental/coref/tests/test_span_resolver.py | 7 +++++++
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/spacy_experimental/coref/tests/test_coref.py b/spacy_experimental/coref/tests/test_coref.py
index 6ff07d4..4064bb3 100644
--- a/spacy_experimental/coref/tests/test_coref.py
+++ b/spacy_experimental/coref/tests/test_coref.py
@@ -83,11 +83,12 @@ def test_initialized(nlp):
 
 
 def test_initialized_short(nlp):
+    # test that short or empty docs don't fail
     nlp.add_pipe("experimental_coref")
     nlp.initialize()
     assert nlp.pipe_names == ["experimental_coref"]
-    text = "Hi there"
-    doc = nlp(text)
+    doc = nlp("Hi")
+    doc = nlp("")
 
 
 def test_coref_serialization(nlp):
diff --git a/spacy_experimental/coref/tests/test_span_resolver.py b/spacy_experimental/coref/tests/test_span_resolver.py
index d2d829d..e0238ee 100644
--- a/spacy_experimental/coref/tests/test_span_resolver.py
+++ b/spacy_experimental/coref/tests/test_span_resolver.py
@@ -79,6 +79,13 @@ def test_not_initialized(nlp):
     with pytest.raises(ValueError, match="E109"):
         nlp(text)
 
+def test_initialized_short(nlp):
+    # docs with one or no tokens should not fail
+    nlp.add_pipe("experimental_span_resolver")
+    nlp.initialize()
+    assert nlp.pipe_names == ["experimental_span_resolver"]
+    nlp("hi")
+    nlp("")
 
 def test_span_resolver_serialization(nlp):
     # Test that the span resolver component can be serialized

From 6db078c3a0d756fac94ef599c543711f7e96f2b1 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Mon, 31 Oct 2022 20:12:35 +0900
Subject: [PATCH 2/4] Add example short doc to tests

It might be better to include this optionally? On the other hand, since
it should just be ignored in training, having it always there is more
thorough.
---
 spacy_experimental/coref/tests/test_coref.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/spacy_experimental/coref/tests/test_coref.py b/spacy_experimental/coref/tests/test_coref.py
index 4064bb3..ea6e67a 100644
--- a/spacy_experimental/coref/tests/test_coref.py
+++ b/spacy_experimental/coref/tests/test_coref.py
@@ -37,6 +37,11 @@ def generate_train_data(prefix=DEFAULT_CLUSTER_PREFIX):
                 }
             },
         ),
+        (
+            # example short doc
+            "ok",
+            {"spans": {}}
+        )
     ]
     # fmt: on
     return data

From f37158584e6a2a1b1b7a12bb1608bcf9efe59ab7 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Mon, 31 Oct 2022 20:13:25 +0900
Subject: [PATCH 3/4] Skip short docs

There can be no coref prediction for docs with one token (or no
tokens). Attempting to process such docs normally causes a mess with
size inference, so they are skipped instead.

In training, this just involves skipping the docs in the update step.
This is simple due to the fake batching structure, since the batch
doesn't have to be maintained.

In inference, this just involves short-circuiting to an empty
prediction.
---
 spacy_experimental/coref/coref_component.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/spacy_experimental/coref/coref_component.py b/spacy_experimental/coref/coref_component.py
index 1e6788d..38ee7c1 100644
--- a/spacy_experimental/coref/coref_component.py
+++ b/spacy_experimental/coref/coref_component.py
@@ -145,6 +145,11 @@ def predict(self, docs: Iterable[Doc]) -> List[MentionClusters]:
         """
         out = []
         for doc in docs:
+            if len(doc) < 2:
+                # no coref in docs with 0 or 1 token
+                out.append([])
+                continue
+
             scores, idxs = self.model.predict([doc])
             # idxs is a list of mentions (start / end idxs)
             # each item in scores includes scores and a mapping from scores to mentions
@@ -232,6 +237,9 @@ def update(
                     predicted docs in coref training.
                     """
                 )
+            if len(eg.predicted) < 2:
+                # no prediction possible for docs of length 0 or 1
+                continue
             preds, backprop = self.model.begin_update([eg.predicted])
             score_matrix, mention_idx = preds
             loss, d_scores = self.get_loss([eg], score_matrix, mention_idx)

From 91a4fefebd92846a83b24354df689de0bad9baf2 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Mon, 31 Oct 2022 20:14:57 +0900
Subject: [PATCH 4/4] Clean up retokenization test

The retokenization test is hard-coded to the training example because
it manually merges some tokens, to make sure that the prediction and
merge line up. It would probably be better to separate out the training
data from the general example here, but for now narrowing the training
data works.
---
 spacy_experimental/coref/tests/test_coref.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy_experimental/coref/tests/test_coref.py b/spacy_experimental/coref/tests/test_coref.py
index ea6e67a..31caf9a 100644
--- a/spacy_experimental/coref/tests/test_coref.py
+++ b/spacy_experimental/coref/tests/test_coref.py
@@ -154,7 +154,8 @@ def test_overfitting_IO(nlp, train_data):
 
 def test_tokenization_mismatch(nlp, train_data):
     train_examples = []
-    for text, annot in train_data:
+    # this is testing a specific test example, so just get the first doc
+    for text, annot in train_data[0:1]:
        eg = Example.from_dict(nlp.make_doc(text), annot)
        ref = eg.reference
        char_spans = {}
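
A minimal usage sketch of the behavior these patches are after,
assuming spacy and spacy-experimental are installed (installing
spacy-experimental registers the factory via entry points); the
component name matches the tests above, and a freshly initialized,
untrained pipeline is enough here because the point is only that short
docs no longer raise:

    import spacy

    nlp = spacy.blank("en")
    nlp.add_pipe("experimental_coref")
    nlp.initialize()

    # Docs with zero or one token are short-circuited to an empty
    # prediction instead of crashing during size inference.
    for text in ("", "Hi"):
        doc = nlp(text)
        # no coref clusters are written for a skipped doc
        assert not any(doc.spans.values())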