From 3558a1084e2969fbb5f3f731683dca3580ea8cf4 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 30 May 2022 18:16:45 +0900 Subject: [PATCH 01/12] Move coref scoring code to scorer.py Includes some renames to make names less generic. --- licenses/3rd_party_licenses.txt | 33 +++++++++ spacy/coref_scorer.py | 124 ------------------------------- spacy/pipeline/coref.py | 4 +- spacy/scorer.py | 126 ++++++++++++++++++++++++++++++++ 4 files changed, 161 insertions(+), 126 deletions(-) delete mode 100644 spacy/coref_scorer.py diff --git a/licenses/3rd_party_licenses.txt b/licenses/3rd_party_licenses.txt index d58da9c4a6b..c605c40b95c 100644 --- a/licenses/3rd_party_licenses.txt +++ b/licenses/3rd_party_licenses.txt @@ -127,3 +127,36 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. + + +coval +----- + +* Files: scorer.py + +The implementations of ClusterEvaluator, lea, get_cluster_info, and +get_markable_assignments are adapted from coval, which is distributed +under the following license: + +The MIT License (MIT) + +Copyright 2018 Nafise Sadat Moosavi (ns.moosavi at gmail dot com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
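For orientation, the coval-derived helpers named in the license entry above combine as follows once they land in spacy/scorer.py (their full implementations appear in the hunks below). A minimal sketch — the toy clusters of (start, end) mention offsets are illustrative only, not taken from this patch:

    from spacy.scorer import ClusterEvaluator, get_cluster_info, lea

    # Mentions are (start, end) token offsets; a cluster is a list of mentions.
    gold_clusters = [[(0, 1), (5, 6)], [(2, 4)]]  # two gold entities
    pred_clusters = [[(0, 1), (5, 6), (2, 4)]]    # one over-merged prediction

    # LEA is the only metric wired up in this series; the evaluator
    # accumulates precision/recall numerators and denominators per update.
    evaluator = ClusterEvaluator(lea)
    evaluator.update(get_cluster_info(pred_clusters, gold_clusters))
    print(evaluator.get_prf())  # approximately (0.33, 0.67, 0.44)
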
+ diff --git a/spacy/coref_scorer.py b/spacy/coref_scorer.py deleted file mode 100644 index 981b1cf03f8..00000000000 --- a/spacy/coref_scorer.py +++ /dev/null @@ -1,124 +0,0 @@ -# copied from coval -# https://github.com/ns-moosavi/coval - - -def get_cluster_info(predicted_clusters, gold_clusters): - p2g = get_markable_assignments(predicted_clusters, gold_clusters) - g2p = get_markable_assignments(gold_clusters, predicted_clusters) - # this is the data format used as input by the evaluator - return (gold_clusters, predicted_clusters, g2p, p2g) - - -def get_markable_assignments(in_clusters, out_clusters): - markable_cluster_ids = {} - out_dic = {} - for cluster_id, cluster in enumerate(out_clusters): - for m in cluster: - out_dic[m] = cluster_id - - for cluster in in_clusters: - for im in cluster: - for om in out_dic: - if im == om: - markable_cluster_ids[im] = out_dic[om] - break - - return markable_cluster_ids - - -def f1(p_num, p_den, r_num, r_den, beta=1): - p = 0 if p_den == 0 else p_num / float(p_den) - r = 0 if r_den == 0 else r_num / float(r_den) - return 0 if p + r == 0 else (1 + beta * beta) * p * r / (beta * beta * p + r) - - -class Evaluator: - def __init__(self, metric, beta=1, keep_aggregated_values=False): - self.p_num = 0 - self.p_den = 0 - self.r_num = 0 - self.r_den = 0 - self.metric = metric - self.beta = beta - self.keep_aggregated_values = keep_aggregated_values - - if keep_aggregated_values: - self.aggregated_p_num = [] - self.aggregated_p_den = [] - self.aggregated_r_num = [] - self.aggregated_r_den = [] - - def update(self, coref_info): - ( - key_clusters, - sys_clusters, - key_mention_sys_cluster, - sys_mention_key_cluster, - ) = coref_info - - pn, pd = self.metric(sys_clusters, key_clusters, sys_mention_key_cluster) - rn, rd = self.metric(key_clusters, sys_clusters, key_mention_sys_cluster) - self.p_num += pn - self.p_den += pd - self.r_num += rn - self.r_den += rd - - if self.keep_aggregated_values: - self.aggregated_p_num.append(pn) - self.aggregated_p_den.append(pd) - self.aggregated_r_num.append(rn) - self.aggregated_r_den.append(rd) - - def get_f1(self): - return f1(self.p_num, self.p_den, self.r_num, self.r_den, beta=self.beta) - - def get_recall(self): - return 0 if self.r_num == 0 else self.r_num / float(self.r_den) - - def get_precision(self): - return 0 if self.p_num == 0 else self.p_num / float(self.p_den) - - def get_prf(self): - return self.get_precision(), self.get_recall(), self.get_f1() - - def get_counts(self): - return self.p_num, self.p_den, self.r_num, self.r_den - - def get_aggregated_values(self): - return ( - self.aggregated_p_num, - self.aggregated_p_den, - self.aggregated_r_num, - self.aggregated_r_den, - ) - - -def lea(input_clusters, output_clusters, mention_to_gold): - num, den = 0, 0 - - for c in input_clusters: - if len(c) == 1: - all_links = 1 - if ( - c[0] in mention_to_gold - and len(output_clusters[mention_to_gold[c[0]]]) == 1 - ): - common_links = 1 - else: - common_links = 0 - else: - common_links = 0 - all_links = len(c) * (len(c) - 1) / 2.0 - for i, m in enumerate(c): - if m in mention_to_gold: - for m2 in c[i + 1 :]: - if ( - m2 in mention_to_gold - and mention_to_gold[m] == mention_to_gold[m2] - ): - common_links += 1 - - num += len(c) * common_links / float(all_links) - den += len(c) - - return num, den diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py index 76e790896dd..0f06735b382 100644 --- a/spacy/pipeline/coref.py +++ b/spacy/pipeline/coref.py @@ -24,7 +24,7 @@ doc2clusters, ) -from ..coref_scorer import 
Evaluator, get_cluster_info, lea +from ..scorer import ClusterEvaluator, get_cluster_info, lea default_config = """ @@ -314,7 +314,7 @@ def score(self, examples, **kwargs): https://api.semanticscholar.org/CorpusID:17606580 """ - evaluator = Evaluator(lea) + evaluator = ClusterEvaluator(lea) for ex in examples: p_clusters = doc2clusters(ex.predicted, self.span_cluster_prefix) diff --git a/spacy/scorer.py b/spacy/scorer.py index 4856bfc0dc2..3b199e50235 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -1116,3 +1116,129 @@ def _auc(x, y): # regular numpy.ndarray instances. area = area.dtype.type(area) return area + + +def get_cluster_info(predicted_clusters, gold_clusters): + p2g = get_markable_assignments(predicted_clusters, gold_clusters) + g2p = get_markable_assignments(gold_clusters, predicted_clusters) + # this is the data format used as input by the evaluator + return (gold_clusters, predicted_clusters, g2p, p2g) + + +def get_markable_assignments(in_clusters, out_clusters): + markable_cluster_ids = {} + out_dic = {} + for cluster_id, cluster in enumerate(out_clusters): + for m in cluster: + out_dic[m] = cluster_id + + for cluster in in_clusters: + for im in cluster: + for om in out_dic: + if im == om: + markable_cluster_ids[im] = out_dic[om] + break + + return markable_cluster_ids + +class ClusterEvaluator: + def __init__(self, metric, beta=1, keep_aggregated_values=False): + self.p_num = 0 + self.p_den = 0 + self.r_num = 0 + self.r_den = 0 + self.metric = metric + self.beta = beta + self.keep_aggregated_values = keep_aggregated_values + + if keep_aggregated_values: + self.aggregated_p_num = [] + self.aggregated_p_den = [] + self.aggregated_r_num = [] + self.aggregated_r_den = [] + + def update(self, coref_info): + ( + key_clusters, + sys_clusters, + key_mention_sys_cluster, + sys_mention_key_cluster, + ) = coref_info + + pn, pd = self.metric(sys_clusters, key_clusters, sys_mention_key_cluster) + rn, rd = self.metric(key_clusters, sys_clusters, key_mention_sys_cluster) + self.p_num += pn + self.p_den += pd + self.r_num += rn + self.r_den += rd + + if self.keep_aggregated_values: + self.aggregated_p_num.append(pn) + self.aggregated_p_den.append(pd) + self.aggregated_r_num.append(rn) + self.aggregated_r_den.append(rd) + + def f1(self, p_num, p_den, r_num, r_den, beta=1): + p = 0 if p_den == 0 else p_num / float(p_den) + r = 0 if r_den == 0 else r_num / float(r_den) + return 0 if p + r == 0 else (1 + beta * beta) * p * r / (beta * beta * p + r) + + def get_f1(self): + return self.f1(self.p_num, self.p_den, self.r_num, self.r_den, beta=self.beta) + + def get_recall(self): + return 0 if self.r_num == 0 else self.r_num / float(self.r_den) + + def get_precision(self): + return 0 if self.p_num == 0 else self.p_num / float(self.p_den) + + def get_prf(self): + return self.get_precision(), self.get_recall(), self.get_f1() + + def get_counts(self): + return self.p_num, self.p_den, self.r_num, self.r_den + + def get_aggregated_values(self): + return ( + self.aggregated_p_num, + self.aggregated_p_den, + self.aggregated_r_num, + self.aggregated_r_den, + ) + + +def lea(input_clusters, output_clusters, mention_to_gold): + """ + LEA is a metric for scoring coref clusters design to avoid pitfals of prior + methods. Proposed in Moosavi and Strube 2016. 
+ + https://api.semanticscholar.org/CorpusID:17606580 + """ + num, den = 0, 0 + + for c in input_clusters: + if len(c) == 1: + all_links = 1 + if ( + c[0] in mention_to_gold + and len(output_clusters[mention_to_gold[c[0]]]) == 1 + ): + common_links = 1 + else: + common_links = 0 + else: + common_links = 0 + all_links = len(c) * (len(c) - 1) / 2.0 + for i, m in enumerate(c): + if m in mention_to_gold: + for m2 in c[i + 1 :]: + if ( + m2 in mention_to_gold + and mention_to_gold[m] == mention_to_gold[m2] + ): + common_links += 1 + + num += len(c) * common_links / float(all_links) + den += len(c) + + return num, den From 00d349a1588fc4231227167173c98954412e935b Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 30 May 2022 18:21:08 +0900 Subject: [PATCH 02/12] Refactor coval code to remove ternary expressions --- spacy/scorer.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/spacy/scorer.py b/spacy/scorer.py index 3b199e50235..af01c99dd13 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -1179,18 +1179,32 @@ def update(self, coref_info): self.aggregated_r_den.append(rd) def f1(self, p_num, p_den, r_num, r_den, beta=1): - p = 0 if p_den == 0 else p_num / float(p_den) - r = 0 if r_den == 0 else r_num / float(r_den) - return 0 if p + r == 0 else (1 + beta * beta) * p * r / (beta * beta * p + r) + p = 0 + if p_den != 0: + p = p_num / float(p_den) + r = 0 + if r_den != 0: + r = r_num / float(r_den) + + if p + r == 0: + return 0 + + return (1 + beta * beta) * p * r / (beta * beta * p + r) def get_f1(self): return self.f1(self.p_num, self.p_den, self.r_num, self.r_den, beta=self.beta) def get_recall(self): - return 0 if self.r_num == 0 else self.r_num / float(self.r_den) + if self.r_num == 0: + return 0 + + return self.r_num / float(self.r_den) def get_precision(self): - return 0 if self.p_num == 0 else self.p_num / float(self.p_den) + if self.p_num == 0: + return 0 + + return self.p_num / float(self.p_den) def get_prf(self): return self.get_precision(), self.get_recall(), self.get_f1() From aaa09f048d49356c08f34406a0cbb0d45047ab8d Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 30 May 2022 18:21:37 +0900 Subject: [PATCH 03/12] Black formatting --- spacy/scorer.py | 42 +++++++++++++++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/spacy/scorer.py b/spacy/scorer.py index af01c99dd13..4615376621a 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -482,11 +482,25 @@ def score_clusters( for span1 in gold_cluster: for span2 in gold_cluster: # only record pairs where span1 comes before span2 - if (span1.start < span2.start) or (span1.start == span2.start and span1.end < span2.end): + if (span1.start < span2.start) or ( + span1.start == span2.start and span1.end < span2.end + ): if include_label: - gold_rel = (span1.label_, span1.start, span1.end - 1, span2.label_, span2.start, span2.end - 1) + gold_rel = ( + span1.label_, + span1.start, + span1.end - 1, + span2.label_, + span2.start, + span2.end - 1, + ) else: - gold_rel = (span1.start, span1.end - 1, span2.start, span2.end - 1) + gold_rel = ( + span1.start, + span1.end - 1, + span2.start, + span2.end - 1, + ) gold_instances.add(gold_rel) if span1.label_ == span2.label_: gold_per_type[span1.label_].add(gold_rel) @@ -495,11 +509,25 @@ def score_clusters( for pred_cluster in pred_clusters: for span1 in pred_cluster: for span2 in pred_cluster: - if (span1.start < span2.start) or (span1.start == span2.start and span1.end < span2.end): + if 
(span1.start < span2.start) or ( + span1.start == span2.start and span1.end < span2.end + ): if include_label: - pred_rel = (span1.label_, span1.start, span1.end - 1, span2.label_, span2.start, span2.end - 1) + pred_rel = ( + span1.label_, + span1.start, + span1.end - 1, + span2.label_, + span2.start, + span2.end - 1, + ) else: - pred_rel = (span1.start, span1.end - 1, span2.start, span2.end - 1) + pred_rel = ( + span1.start, + span1.end - 1, + span2.start, + span2.end - 1, + ) pred_instances.add(pred_rel) if span1.label_ == span2.label_: pred_per_type[span1.label_].add(pred_rel) @@ -515,7 +543,6 @@ def score_clusters( f"{attr}_p": None, f"{attr}_r": None, f"{attr}_f": None, - } if include_label: final_scores[f"{attr}_per_type"] = None @@ -1141,6 +1168,7 @@ def get_markable_assignments(in_clusters, out_clusters): return markable_cluster_ids + class ClusterEvaluator: def __init__(self, metric, beta=1, keep_aggregated_values=False): self.p_num = 0 From d9b601e2689b9e49292fe47845d7719a95fd4c2c Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 30 May 2022 19:48:56 +0900 Subject: [PATCH 04/12] Add header --- spacy/scorer.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/spacy/scorer.py b/spacy/scorer.py index f21c635d579..6c151ed2483 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -1145,6 +1145,11 @@ def _auc(x, y): return area +# The following implementations of get_cluster_info(), get_markable_assignments, +# and ClusterEvaluator are adapted from coval, which is distributed under the +# MIT License. +# Copyright 2018 Nafise Sadat Moosavi +# See licenses/3rd_party_licenses.txt def get_cluster_info(predicted_clusters, gold_clusters): p2g = get_markable_assignments(predicted_clusters, gold_clusters) g2p = get_markable_assignments(gold_clusters, predicted_clusters) From 0b9c275072b9eeb722d23fa86e25d66a384ccce5 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 31 May 2022 18:59:28 +0900 Subject: [PATCH 05/12] Make scorers into registered scorers --- spacy/ml/models/coref.py | 4 +- spacy/ml/models/coref_util.py | 17 ------ spacy/pipeline/coref.py | 49 +++++++----------- spacy/pipeline/span_predictor.py | 61 +++++++++------------- spacy/scorer.py | 88 +++++++++++++++++++++++++++++--- 5 files changed, 127 insertions(+), 92 deletions(-) diff --git a/spacy/ml/models/coref.py b/spacy/ml/models/coref.py index 96fad801982..8eb37097c16 100644 --- a/spacy/ml/models/coref.py +++ b/spacy/ml/models/coref.py @@ -56,7 +56,9 @@ def convert_coref_clusterer_inputs(model: Model, X: List[Floats2d], is_train: bo def backprop(args: ArgsKwargs) -> List[Floats2d]: # convert to xp and wrap in list gradients = torch2xp(args.args[0]) - assert isinstance(gradients, Floats2d) + #TODO why did this change? This was fine before merging master. + # It's still 2d but this assert fails. + #assert isinstance(gradients, Floats2d) return [gradients] return ArgsKwargs(args=(word_features,), kwargs={}), backprop diff --git a/spacy/ml/models/coref_util.py b/spacy/ml/models/coref_util.py index dc9366a613a..a004a69d73c 100644 --- a/spacy/ml/models/coref_util.py +++ b/spacy/ml/models/coref_util.py @@ -32,23 +32,6 @@ def get_sentence_ids(doc): return out -def doc2clusters(doc: Doc, prefix=DEFAULT_CLUSTER_PREFIX) -> MentionClusters: - """Given a doc, give the mention clusters. - - This is useful for scoring. 
- """ - out = [] - for name, val in doc.spans.items(): - if not name.startswith(prefix): - continue - - cluster = [] - for mention in val: - cluster.append((mention.start, mention.end)) - out.append(cluster) - return out - - # from model.py, refactored to be non-member def get_predicted_antecedents(xp, antecedent_idx, antecedent_scores): """Get the ID of the antecedent for each span. -1 if no antecedent.""" diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py index 14fe56ba308..7857bfea8b3 100644 --- a/spacy/pipeline/coref.py +++ b/spacy/pipeline/coref.py @@ -13,6 +13,7 @@ from ..errors import Errors from ..tokens import Doc from ..vocab import Vocab +from ..util import registry from ..ml.models.coref_util import ( create_gold_scores, @@ -21,10 +22,9 @@ get_clusters_from_doc, get_predicted_clusters, DEFAULT_CLUSTER_PREFIX, - doc2clusters, ) -from ..scorer import ClusterEvaluator, get_cluster_info, lea +from ..scorer import Scorer default_config = """ @@ -56,7 +56,14 @@ """ DEFAULT_COREF_MODEL = Config().from_str(default_config)["model"] -DEFAULT_CLUSTERS_PREFIX = "coref_clusters" + +def coref_scorer(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: + return Scorer.score_coref_clusters(examples, **kwargs) + + +@registry.scorers("spacy.coref_scorer.v1") +def make_coref_scorer(): + return coref_scorer @Language.factory( @@ -66,6 +73,7 @@ default_config={ "model": DEFAULT_COREF_MODEL, "span_cluster_prefix": DEFAULT_CLUSTER_PREFIX, + "scorer": {"@scorers": "spacy.coref_scorer.v1"}, }, default_score_weights={"coref_f": 1.0, "coref_p": None, "coref_r": None}, ) @@ -73,12 +81,13 @@ def make_coref( nlp: Language, name: str, model, - span_cluster_prefix: str = "coref", + scorer: Optional[Callable], + span_cluster_prefix: str, ) -> "CoreferenceResolver": """Create a CoreferenceResolver component.""" return CoreferenceResolver( - nlp.vocab, model, name, span_cluster_prefix=span_cluster_prefix + nlp.vocab, model, name, span_cluster_prefix=span_cluster_prefix, scorer=scorer ) @@ -95,7 +104,8 @@ def __init__( name: str = "coref", *, span_mentions: str = "coref_mentions", - span_cluster_prefix: str, + span_cluster_prefix: str = DEFAULT_CLUSTER_PREFIX, + scorer: Optional[Callable] = coref_scorer, ) -> None: """Initialize a coreference resolution component. @@ -117,7 +127,8 @@ def __init__( self.span_cluster_prefix = span_cluster_prefix self._rehearsal_model = None - self.cfg: Dict[str, Any] = {} + self.cfg: Dict[str, Any] = {"span_cluster_prefix": span_cluster_prefix} + self.scorer = scorer def predict(self, docs: Iterable[Doc]) -> List[MentionClusters]: """Apply the pipeline's model to a batch of docs, without modifying them. @@ -275,7 +286,6 @@ def get_loss( log_marg = ops.softmax(score_matrix + ops.xp.log(top_gscores), axis=1) log_norm = ops.softmax(score_matrix, axis=1) grad = log_norm - log_marg - # gradients.append((grad, cidx)) loss = float((grad**2).sum()) return loss, grad @@ -305,26 +315,3 @@ def initialize( assert len(X) > 0, Errors.E923.format(name=self.name) self.model.initialize(X=X, Y=Y) - - def score(self, examples, **kwargs): - """Score a batch of examples using LEA. - For details on how LEA works and why to use it see the paper: - Which Coreference Evaluation Metric Do You Trust? 
A Proposal for a Link-based Entity Aware Metric - Moosavi and Strube, 2016 - https://api.semanticscholar.org/CorpusID:17606580 - """ - - evaluator = ClusterEvaluator(lea) - - for ex in examples: - p_clusters = doc2clusters(ex.predicted, self.span_cluster_prefix) - g_clusters = doc2clusters(ex.reference, self.span_cluster_prefix) - cluster_info = get_cluster_info(p_clusters, g_clusters) - evaluator.update(cluster_info) - - score = { - "coref_f": evaluator.get_f1(), - "coref_p": evaluator.get_precision(), - "coref_r": evaluator.get_recall(), - } - return score diff --git a/spacy/pipeline/span_predictor.py b/spacy/pipeline/span_predictor.py index d21a45edbdb..f406c944a7e 100644 --- a/spacy/pipeline/span_predictor.py +++ b/spacy/pipeline/span_predictor.py @@ -5,20 +5,19 @@ from thinc.api import Model, Config, Optimizer, CategoricalCrossentropy from thinc.api import set_dropout_rate, to_categorical from itertools import islice -from statistics import mean from .trainable_pipe import TrainablePipe from ..language import Language from ..training import Example, validate_examples, validate_get_examples from ..errors import Errors -from ..scorer import Scorer +from ..scorer import Scorer, doc2clusters from ..tokens import Doc from ..vocab import Vocab +from ..util import registry from ..ml.models.coref_util import ( MentionClusters, DEFAULT_CLUSTER_PREFIX, - doc2clusters, ) default_span_predictor_config = """ @@ -51,6 +50,15 @@ DEFAULT_SPAN_PREDICTOR_MODEL = Config().from_str(default_span_predictor_config)["model"] +def span_predictor_scorer(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: + return Scorer.score_span_predictions(examples, **kwargs) + + +@registry.scorers("spacy.span_predictor_scorer.v1") +def make_span_predictor_scorer(): + return span_predictor_scorer + + @Language.factory( "span_predictor", assigns=["doc.spans"], @@ -59,6 +67,7 @@ "model": DEFAULT_SPAN_PREDICTOR_MODEL, "input_prefix": "coref_head_clusters", "output_prefix": "coref_clusters", + "scorer": {"@scorers": "spacy.span_predictor_scorer.v1"}, }, default_score_weights={"span_accuracy": 1.0}, ) @@ -68,10 +77,16 @@ def make_span_predictor( model, input_prefix: str = "coref_head_clusters", output_prefix: str = "coref_clusters", + scorer: Optional[Callable] = span_predictor_scorer, ) -> "SpanPredictor": """Create a SpanPredictor component.""" return SpanPredictor( - nlp.vocab, model, name, input_prefix=input_prefix, output_prefix=output_prefix + nlp.vocab, + model, + name, + input_prefix=input_prefix, + output_prefix=output_prefix, + scorer=scorer, ) @@ -89,6 +104,7 @@ def __init__( *, input_prefix: str = "coref_head_clusters", output_prefix: str = "coref_clusters", + scorer: Optional[Callable] = span_predictor_scorer, ) -> None: self.vocab = vocab self.model = model @@ -96,7 +112,10 @@ def __init__( self.input_prefix = input_prefix self.output_prefix = output_prefix - self.cfg: Dict[str, Any] = {} + self.scorer = scorer + self.cfg: Dict[str, Any] = { + "output_prefix": output_prefix, + } def predict(self, docs: Iterable[Doc]) -> List[MentionClusters]: # for now pretend there's just one doc @@ -254,35 +273,3 @@ def initialize( assert len(X) > 0, Errors.E923.format(name=self.name) self.model.initialize(X=X, Y=Y) - - def score(self, examples, **kwargs): - """ - Evaluate on reconstructing the correct spans around - gold heads. 
- """ - scores = [] - xp = self.model.ops.xp - for eg in examples: - starts = [] - ends = [] - pred_starts = [] - pred_ends = [] - ref = eg.reference - pred = eg.predicted - for key, gold_sg in ref.spans.items(): - if key.startswith(self.output_prefix): - pred_sg = pred.spans[key] - for gold_mention, pred_mention in zip(gold_sg, pred_sg): - starts.append(gold_mention.start) - ends.append(gold_mention.end) - pred_starts.append(pred_mention.start) - pred_ends.append(pred_mention.end) - - starts = xp.asarray(starts) - ends = xp.asarray(ends) - pred_starts = xp.asarray(pred_starts) - pred_ends = xp.asarray(pred_ends) - correct = (starts == pred_starts) * (ends == pred_ends) - accuracy = correct.mean() - scores.append(float(accuracy)) - return {"span_accuracy": mean(scores)} diff --git a/spacy/scorer.py b/spacy/scorer.py index 6c151ed2483..8747b5772d2 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -2,6 +2,7 @@ from typing import TYPE_CHECKING import numpy as np from collections import defaultdict +from statistics import mean from .training import Example from .tokens import Token, Doc, Span @@ -9,6 +10,7 @@ from .util import get_lang_class, SimpleFrozenList from .morphology import Morphology + if TYPE_CHECKING: # This lets us add type hints for mypy etc. without causing circular imports from .language import Language # noqa: F401 @@ -873,6 +875,67 @@ def score_deps( f"{attr}_las_per_type": None, } + @staticmethod + def score_coref_clusters(examples: Iterable[Example], **cfg): + """Score a batch of examples using LEA. + + For details on how LEA works and why to use it see the paper: + Which Coreference Evaluation Metric Do You Trust? A Proposal for a Link-based Entity Aware Metric + Moosavi and Strube, 2016 + https://api.semanticscholar.org/CorpusID:17606580 + """ + + span_cluster_prefix = cfg["span_cluster_prefix"] + + evaluator = ClusterEvaluator(lea) + + for ex in examples: + p_clusters = doc2clusters(ex.predicted, span_cluster_prefix) + g_clusters = doc2clusters(ex.reference, span_cluster_prefix) + cluster_info = get_cluster_info(p_clusters, g_clusters) + evaluator.update(cluster_info) + + score = { + "coref_f": evaluator.get_f1(), + "coref_p": evaluator.get_precision(), + "coref_r": evaluator.get_recall(), + } + return score + + @staticmethod + def score_span_predictions(examples: Iterable[Example], **cfg): + """Evaluate reconstruction of the correct spans from gold heads. 
+ """ + scores = [] + output_prefix = cfg["output_prefix"] + for eg in examples: + starts = [] + ends = [] + pred_starts = [] + pred_ends = [] + ref = eg.reference + pred = eg.predicted + for key, gold_sg in ref.spans.items(): + if key.startswith(output_prefix): + pred_sg = pred.spans[key] + for gold_mention, pred_mention in zip(gold_sg, pred_sg): + starts.append(gold_mention.start) + ends.append(gold_mention.end) + pred_starts.append(pred_mention.start) + pred_ends.append(pred_mention.end) + + + # TODO check logic + # see how many are perfect + cs = [a == b for a, b in zip(starts, pred_starts)] + ce = [a == b for a, b in zip(ends, pred_ends)] + correct = [int(a and b) for a, b in zip(cs, ce)] + accuracy = sum(correct) / len(correct) + + scores.append(float(accuracy)) + out_key = f"span_{output_prefix}_accuracy" + return {out_key: mean(scores)} + def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: """Compute micro-PRF and per-entity PRF scores for a sequence of examples.""" @@ -1255,12 +1318,6 @@ def get_aggregated_values(self): def lea(input_clusters, output_clusters, mention_to_gold): - """ - LEA is a metric for scoring coref clusters design to avoid pitfals of prior - methods. Proposed in Moosavi and Strube 2016. - - https://api.semanticscholar.org/CorpusID:17606580 - """ num, den = 0, 0 for c in input_clusters: @@ -1289,3 +1346,22 @@ def lea(input_clusters, output_clusters, mention_to_gold): den += len(c) return num, den + + +# This is coref related, but not from coval. +# def doc2clusters(doc: Doc, prefix) -> MentionClusters: +def doc2clusters(doc, prefix): + """Given a doc, give the mention clusters. + + This is used for scoring. + """ + out = [] + for name, val in doc.spans.items(): + if not name.startswith(prefix): + continue + + cluster = [] + for mention in val: + cluster.append((mention.start, mention.end)) + out.append(cluster) + return out From 8faf168de16461537353cc76fcbbde6841bc34fa Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 31 May 2022 19:29:32 +0900 Subject: [PATCH 06/12] Small test fixes --- spacy/tests/pipeline/test_coref.py | 6 +++--- spacy/tests/test_models.py | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/spacy/tests/pipeline/test_coref.py b/spacy/tests/pipeline/test_coref.py index 25de6e35634..c1048682a9c 100644 --- a/spacy/tests/pipeline/test_coref.py +++ b/spacy/tests/pipeline/test_coref.py @@ -5,8 +5,8 @@ from spacy.training import Example from spacy.lang.en import English from spacy.tests.util import make_tempdir -from spacy.pipeline.coref import DEFAULT_CLUSTERS_PREFIX from spacy.ml.models.coref_util import ( + DEFAULT_CLUSTER_PREFIX, select_non_crossing_spans, get_sentence_ids, ) @@ -17,12 +17,12 @@ "Yes, I noticed that many friends around me received it. 
It seems that almost everyone received this SMS.", { "spans": { - f"{DEFAULT_CLUSTERS_PREFIX}_1": [ + f"{DEFAULT_CLUSTER_PREFIX}_1": [ (5, 6, "MENTION"), # I (40, 42, "MENTION"), # me ], - f"{DEFAULT_CLUSTERS_PREFIX}_2": [ + f"{DEFAULT_CLUSTER_PREFIX}_2": [ (52, 54, "MENTION"), # it (95, 103, "MENTION"), # this SMS ] diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py index 794f9ca8797..b3ce46e3405 100644 --- a/spacy/tests/test_models.py +++ b/spacy/tests/test_models.py @@ -7,8 +7,9 @@ import numpy from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder from spacy.ml.models import build_bow_text_classifier, build_simple_cnn_text_classifier +from spacy.ml.models import build_spancat_model if has_torch: - from spacy.ml.models import build_spancat_model, build_wl_coref_model + from spacy.ml.models import build_wl_coref_model, build_span_predictor from spacy.ml.staticvectors import StaticVectors from spacy.ml.extract_spans import extract_spans, _get_span_indices from spacy.lang.en import English From f2096ff6d3f79c565b2d7339006d7bb30fb2f800 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Fri, 3 Jun 2022 14:07:47 +0900 Subject: [PATCH 07/12] Skip coref tests when torch not present Coref can't be loaded without Torch, so nothing works. --- spacy/tests/pipeline/test_coref.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/spacy/tests/pipeline/test_coref.py b/spacy/tests/pipeline/test_coref.py index c1048682a9c..efa68cc3c95 100644 --- a/spacy/tests/pipeline/test_coref.py +++ b/spacy/tests/pipeline/test_coref.py @@ -11,6 +11,8 @@ get_sentence_ids, ) +from thinc.util import has_torch + # fmt: off TRAIN_DATA = [ ( @@ -45,18 +47,20 @@ def snlp(): return en +@pytest.mark.skipif(not has_torch, reason="Torch not available") def test_add_pipe(nlp): nlp.add_pipe("coref") assert nlp.pipe_names == ["coref"] +@pytest.mark.skipif(not has_torch, reason="Torch not available") def test_not_initialized(nlp): nlp.add_pipe("coref") text = "She gave me her pen." with pytest.raises(ValueError): nlp(text) - +@pytest.mark.skipif(not has_torch, reason="Torch not available") def test_initialized(nlp): nlp.add_pipe("coref") nlp.initialize() @@ -68,6 +72,7 @@ def test_initialized(nlp): assert len(v) <= 15 +@pytest.mark.skipif(not has_torch, reason="Torch not available") def test_initialized_short(nlp): nlp.add_pipe("coref") nlp.initialize() @@ -77,6 +82,7 @@ def test_initialized_short(nlp): print(doc.spans) +@pytest.mark.skipif(not has_torch, reason="Torch not available") def test_coref_serialization(nlp): # Test that the coref component can be serialized nlp.add_pipe("coref", last=True) @@ -101,6 +107,7 @@ def test_coref_serialization(nlp): # assert spans_result == spans_result2 +@pytest.mark.skipif(not has_torch, reason="Torch not available") def test_overfitting_IO(nlp): # Simple test to try and quickly overfit the senter - ensuring the ML models work correctly train_examples = [] @@ -147,6 +154,7 @@ def test_overfitting_IO(nlp): # assert_equal(batch_deps_1, no_batch_deps) +@pytest.mark.skipif(not has_torch, reason="Torch not available") def test_crossing_spans(): starts = [6, 10, 0, 1, 0, 1, 0, 1, 2, 2, 2] ends = [12, 12, 2, 3, 3, 4, 4, 4, 3, 4, 5] @@ -158,6 +166,7 @@ def test_crossing_spans(): guess = sorted(guess) assert gold == guess +@pytest.mark.skipif(not has_torch, reason="Torch not available") def test_sentence_map(snlp): doc = snlp("I like text. 
This is text.") sm = get_sentence_ids(doc) From 028a23a0c5818cfaa0be731686cc5324c8be49a2 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Fri, 3 Jun 2022 16:10:48 +0900 Subject: [PATCH 08/12] Fix remaining type issues Some of this just involves ignoring types in thorny areas. Two main issues: 1. Some things have weird types due to indirection/ argskwargs 2. xp2torch return type seems to have changed at some point --- spacy/ml/models/coref.py | 3 ++- spacy/ml/models/span_predictor.py | 15 ++++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/spacy/ml/models/coref.py b/spacy/ml/models/coref.py index 8eb37097c16..976195a760d 100644 --- a/spacy/ml/models/coref.py +++ b/spacy/ml/models/coref.py @@ -53,7 +53,8 @@ def convert_coref_clusterer_inputs(model: Model, X: List[Floats2d], is_train: bo X = X[0] word_features = xp2torch(X, requires_grad=is_train) - def backprop(args: ArgsKwargs) -> List[Floats2d]: + # TODO fix or remove type annotations + def backprop(args: ArgsKwargs): #-> List[Floats2d]: # convert to xp and wrap in list gradients = torch2xp(args.args[0]) #TODO why did this change? This was fine before merging master. diff --git a/spacy/ml/models/span_predictor.py b/spacy/ml/models/span_predictor.py index a8c4d1aaa7e..4074de057dc 100644 --- a/spacy/ml/models/span_predictor.py +++ b/spacy/ml/models/span_predictor.py @@ -48,23 +48,24 @@ def build_span_predictor( def convert_span_predictor_inputs( - model: Model, X: Tuple[Ints1d, Tuple[Floats2d, Ints1d]], is_train: bool + model: Model, X: Tuple[List[Floats2d], Tuple[List[Ints1d], List[Ints1d]]], is_train: bool ): tok2vec, (sent_ids, head_ids) = X # Normally we should use the input is_train, but for these two it's not relevant - - def backprop(args: ArgsKwargs) -> List[Floats2d]: + # TODO fix the type here, or remove it + def backprop(args: ArgsKwargs): #-> Tuple[List[Floats2d], None]: gradients = torch2xp(args.args[1]) + # The sent_ids and head_ids are None because no gradients return [[gradients], None] word_features = xp2torch(tok2vec[0], requires_grad=is_train) - sent_ids = xp2torch(sent_ids[0], requires_grad=False) + sent_ids_tensor = xp2torch(sent_ids[0], requires_grad=False) if not head_ids[0].size: - head_ids = torch.empty(size=(0,)) + head_ids_tensor = torch.empty(size=(0,)) else: - head_ids = xp2torch(head_ids[0], requires_grad=False) + head_ids_tensor = xp2torch(head_ids[0], requires_grad=False) - argskwargs = ArgsKwargs(args=(sent_ids, word_features, head_ids), kwargs={}) + argskwargs = ArgsKwargs(args=(sent_ids_tensor, word_features, head_ids_tensor), kwargs={}) return argskwargs, backprop From aeb84255d1a638b0f4663c6972fc0e3cf999cafb Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 21 Jun 2022 19:26:02 +0900 Subject: [PATCH 09/12] Update spacy/scorer.py Co-authored-by: kadarakos --- spacy/scorer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/scorer.py b/spacy/scorer.py index 8747b5772d2..69d2a99ff5f 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -1350,7 +1350,7 @@ def lea(input_clusters, output_clusters, mention_to_gold): # This is coref related, but not from coval. # def doc2clusters(doc: Doc, prefix) -> MentionClusters: -def doc2clusters(doc, prefix): +def doc2clusters(doc: Doc, prefix: str) -> List[Tuple[int, int]]: """Given a doc, give the mention clusters. This is used for scoring. 
From 63473c972de97543899cbfbe4c2d0041b8f21628 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 21 Jun 2022 19:52:23 +0900 Subject: [PATCH 10/12] Small changes from review --- spacy/ml/models/coref.py | 3 --- spacy/scorer.py | 2 -- spacy/tests/pipeline/test_coref.py | 1 - 3 files changed, 6 deletions(-) diff --git a/spacy/ml/models/coref.py b/spacy/ml/models/coref.py index 976195a760d..74f5d99ae2c 100644 --- a/spacy/ml/models/coref.py +++ b/spacy/ml/models/coref.py @@ -57,9 +57,6 @@ def convert_coref_clusterer_inputs(model: Model, X: List[Floats2d], is_train: bo def backprop(args: ArgsKwargs): #-> List[Floats2d]: # convert to xp and wrap in list gradients = torch2xp(args.args[0]) - #TODO why did this change? This was fine before merging master. - # It's still 2d but this assert fails. - #assert isinstance(gradients, Floats2d) return [gradients] return ArgsKwargs(args=(word_features,), kwargs={}), backprop diff --git a/spacy/scorer.py b/spacy/scorer.py index 69d2a99ff5f..8f54cc8b489 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -925,7 +925,6 @@ def score_span_predictions(examples: Iterable[Example], **cfg): pred_ends.append(pred_mention.end) - # TODO check logic # see how many are perfect cs = [a == b for a, b in zip(starts, pred_starts)] ce = [a == b for a, b in zip(ends, pred_ends)] @@ -1349,7 +1348,6 @@ def lea(input_clusters, output_clusters, mention_to_gold): # This is coref related, but not from coval. -# def doc2clusters(doc: Doc, prefix) -> MentionClusters: def doc2clusters(doc: Doc, prefix: str) -> List[Tuple[int, int]]: """Given a doc, give the mention clusters. diff --git a/spacy/tests/pipeline/test_coref.py b/spacy/tests/pipeline/test_coref.py index efa68cc3c95..3ecc9475ffc 100644 --- a/spacy/tests/pipeline/test_coref.py +++ b/spacy/tests/pipeline/test_coref.py @@ -79,7 +79,6 @@ def test_initialized_short(nlp): assert nlp.pipe_names == ["coref"] text = "Hi there" doc = nlp(text) - print(doc.spans) @pytest.mark.skipif(not has_torch, reason="Torch not available") From 14fd2498c318c53554358f35919c31bffb92dfeb Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 21 Jun 2022 19:51:35 +0900 Subject: [PATCH 11/12] Be specific about the ValueError --- spacy/tests/pipeline/test_coref.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/pipeline/test_coref.py b/spacy/tests/pipeline/test_coref.py index 3ecc9475ffc..53f0b201169 100644 --- a/spacy/tests/pipeline/test_coref.py +++ b/spacy/tests/pipeline/test_coref.py @@ -57,7 +57,7 @@ def test_add_pipe(nlp): def test_not_initialized(nlp): nlp.add_pipe("coref") text = "She gave me her pen." - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="E109"): nlp(text) @pytest.mark.skipif(not has_torch, reason="Torch not available") From 1b21a25f73d2c866f29f19c1419006e126441137 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Wed, 22 Jun 2022 14:54:48 +0900 Subject: [PATCH 12/12] Type fix --- spacy/scorer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/scorer.py b/spacy/scorer.py index 8f54cc8b489..14b4b2a7956 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -1348,7 +1348,7 @@ def lea(input_clusters, output_clusters, mention_to_gold): # This is coref related, but not from coval. -def doc2clusters(doc: Doc, prefix: str) -> List[Tuple[int, int]]: +def doc2clusters(doc: Doc, prefix: str) -> List[List[Tuple[int, int]]]: """Given a doc, give the mention clusters. This is used for scoring.