From 77488414550ca67eca7070cff8e891d2061c8237 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Mon, 31 Oct 2022 19:44:54 +0900
Subject: [PATCH 1/4] Fix handling of small docs in coref

Docs with one or zero tokens fail in the coref component. This doesn't
have a fix yet, just a failing test. (There is also a test for the span
resolver, which does not fail.)
---
 spacy_experimental/coref/tests/test_coref.py         | 5 +++--
 spacy_experimental/coref/tests/test_span_resolver.py | 7 +++++++
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/spacy_experimental/coref/tests/test_coref.py b/spacy_experimental/coref/tests/test_coref.py
index 6ff07d4..4064bb3 100644
--- a/spacy_experimental/coref/tests/test_coref.py
+++ b/spacy_experimental/coref/tests/test_coref.py
@@ -83,11 +83,12 @@ def test_initialized(nlp):
 
 
 def test_initialized_short(nlp):
+    # test that short or empty docs don't fail
     nlp.add_pipe("experimental_coref")
     nlp.initialize()
     assert nlp.pipe_names == ["experimental_coref"]
-    text = "Hi there"
-    doc = nlp(text)
+    doc = nlp("Hi")
+    doc = nlp("")
 
 
 def test_coref_serialization(nlp):
diff --git a/spacy_experimental/coref/tests/test_span_resolver.py b/spacy_experimental/coref/tests/test_span_resolver.py
index d2d829d..e0238ee 100644
--- a/spacy_experimental/coref/tests/test_span_resolver.py
+++ b/spacy_experimental/coref/tests/test_span_resolver.py
@@ -79,6 +79,13 @@ def test_not_initialized(nlp):
     with pytest.raises(ValueError, match="E109"):
         nlp(text)
 
+def test_initialized_short(nlp):
+    # docs with one or no tokens should not fail
+    nlp.add_pipe("experimental_span_resolver")
+    nlp.initialize()
+    assert nlp.pipe_names == ["experimental_span_resolver"]
+    nlp("hi")
+    nlp("")
 
 def test_span_resolver_serialization(nlp):
     # Test that the span resolver component can be serialized

From 6db078c3a0d756fac94ef599c543711f7e96f2b1 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Mon, 31 Oct 2022 20:12:35 +0900
Subject: [PATCH 2/4] Add example short doc to tests

It might be better to include this optionally? On the other hand, since
it should just be ignored in training, having it always there is more
thorough.
---
 spacy_experimental/coref/tests/test_coref.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/spacy_experimental/coref/tests/test_coref.py b/spacy_experimental/coref/tests/test_coref.py
index 4064bb3..ea6e67a 100644
--- a/spacy_experimental/coref/tests/test_coref.py
+++ b/spacy_experimental/coref/tests/test_coref.py
@@ -37,6 +37,11 @@ def generate_train_data(prefix=DEFAULT_CLUSTER_PREFIX):
                 }
             },
         ),
+        (
+            # example short doc
+            "ok",
+            {"spans": {}}
+        )
     ]
     # fmt: on
     return data

From f37158584e6a2a1b1b7a12bb1608bcf9efe59ab7 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Mon, 31 Oct 2022 20:13:25 +0900
Subject: [PATCH 3/4] Skip short docs

There can be no coref prediction for docs with one token (or no
tokens). Attempting to process such docs normally causes a mess with
size inference, so they are skipped instead.

In training, this just involves skipping the docs in the update step.
This is simple due to the fake batching structure, since the batch
doesn't have to be maintained.

In inference, this just involves short-circuiting to an empty
prediction.
---
 spacy_experimental/coref/coref_component.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/spacy_experimental/coref/coref_component.py b/spacy_experimental/coref/coref_component.py
index 1e6788d..38ee7c1 100644
--- a/spacy_experimental/coref/coref_component.py
+++ b/spacy_experimental/coref/coref_component.py
@@ -145,6 +145,11 @@ def predict(self, docs: Iterable[Doc]) -> List[MentionClusters]:
         """
         out = []
         for doc in docs:
+            if len(doc) < 2:
+                # no coref in docs with 0 or 1 token
+                out.append([])
+                continue
+
             scores, idxs = self.model.predict([doc])
             # idxs is a list of mentions (start / end idxs)
             # each item in scores includes scores and a mapping from scores to mentions
@@ -232,6 +237,9 @@ def update(
                     predicted docs in coref training.
                     """
                 )
+            if len(eg.predicted) < 2:
+                # no prediction possible for docs of length 0 or 1
+                continue
             preds, backprop = self.model.begin_update([eg.predicted])
             score_matrix, mention_idx = preds
             loss, d_scores = self.get_loss([eg], score_matrix, mention_idx)

From 91a4fefebd92846a83b24354df689de0bad9baf2 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Mon, 31 Oct 2022 20:14:57 +0900
Subject: [PATCH 4/4] Clean up retokenization test

The retokenization test is hard-coded to the training example because
it manually merges some tokens, to make sure that the prediction and
merge line up. It would probably be better to separate out the training
data from the general example here, but for now narrowing the training
data works.
---
 spacy_experimental/coref/tests/test_coref.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy_experimental/coref/tests/test_coref.py b/spacy_experimental/coref/tests/test_coref.py
index ea6e67a..31caf9a 100644
--- a/spacy_experimental/coref/tests/test_coref.py
+++ b/spacy_experimental/coref/tests/test_coref.py
@@ -154,7 +154,8 @@ def test_overfitting_IO(nlp, train_data):
 
 def test_tokenization_mismatch(nlp, train_data):
     train_examples = []
-    for text, annot in train_data:
+    # this is testing a specific test example, so just get the first doc
+    for text, annot in train_data[0:1]:
        eg = Example.from_dict(nlp.make_doc(text), annot)
        ref = eg.reference
        char_spans = {}
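
A minimal usage sketch of the behavior these patches are after,
assuming spacy and spacy-experimental are installed (installing
spacy-experimental registers the factory via entry points); the
component name matches the tests above, and a freshly initialized,
untrained pipeline is enough here because the point is only that short
docs no longer raise:

    import spacy

    nlp = spacy.blank("en")
    nlp.add_pipe("experimental_coref")
    nlp.initialize()

    # Docs with zero or one token are short-circuited to an empty
    # prediction instead of crashing during size inference.
    for text in ("", "Hi"):
        doc = nlp(text)
        # no coref clusters are written for a skipped doc
        assert not any(doc.spans.values())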