From 7ea8c4aaa54073f1cb5089d1ebb83b2c55cd1b64 Mon Sep 17 00:00:00 2001
From: svlandeg <svlandeg@github.com>
Date: Wed, 27 Mar 2024 18:09:51 +0100
Subject: [PATCH] fix EL scorer

---
 spacy/pipeline/entity_linker.py            | 62 +++++++-------
 spacy/tests/pipeline/test_entity_linker.py | 97 ++++++++++++++++++++++
 2 files changed, 130 insertions(+), 29 deletions(-)

diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 0581d51ab23..2df293379b8 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -235,7 +235,6 @@ def __init__(
         self.cfg: Dict[str, Any] = {"overwrite": overwrite}
         self.distance = CosineDistance(normalize=False)
         self.kb = generate_empty_kb(self.vocab, entity_vector_length)
-        self.scorer = scorer
         self.use_gold_ents = use_gold_ents
         self.candidates_batch_size = candidates_batch_size
         self.threshold = threshold
@@ -243,6 +242,33 @@ def __init__(
         if candidates_batch_size < 1:
             raise ValueError(Errors.E1044)
 
+        def _score_augmented(examples, **kwargs):
+            # Language.evaluate calls pipe() on the predicted docs, which have no entities when the
+            # pipeline has no NER; with use_gold_ents we therefore set the gold ents and re-run pipe().
+            if not self.use_gold_ents:
+                return scorer(examples, **kwargs)
+            else:
+                examples = self._augment_examples(examples)
+                docs = self.pipe(
+                    (eg.predicted for eg in examples),
+                )
+                for eg, doc in zip(examples, docs):
+                    eg.predicted = doc
+                return scorer(examples, **kwargs)
+
+        self.scorer = _score_augmented
+
+    def _augment_examples(self, examples: Iterable[Example]) -> Iterable[Example]:
+        """If use_gold_ents is true, overwrite eg.predicted.ents in place with the aligned gold
+        entities. Returns a new list holding the same (not copied) Example objects."""
+        new_examples = []
+        for eg in examples:
+            if self.use_gold_ents:
+                ents, _ = eg.get_aligned_ents_and_ner()
+                eg.predicted.ents = ents
+            new_examples.append(eg)
+        return new_examples
+
     def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
         """Define the KB of this pipe by providing a function that will
         create it using this object's vocab."""
@@ -284,13 +310,9 @@ def initialize(
         nO = self.kb.entity_vector_length
         doc_sample = []
         vector_sample = []
-        orig_ents = []
-        for eg in islice(get_examples(), 10):
+        examples = self._augment_examples(islice(get_examples(), 10))
+        for eg in examples:
             doc = eg.x
-            if self.use_gold_ents:
-                orig_ents.append(doc.ents)
-                ents, _ = eg.get_aligned_ents_and_ner()
-                doc.ents = ents
             doc_sample.append(doc)
             vector_sample.append(self.model.ops.alloc1f(nO))
         assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
@@ -315,10 +337,6 @@ def initialize(
         if not has_annotations:
             # Clean up dummy annotation
             doc.ents = []
-        if self.use_gold_ents:
-            assert len(doc_sample) == len(orig_ents)
-            for doc, orig_ent in zip(doc_sample, orig_ents):
-                doc.ents = orig_ent
 
     def batch_has_learnable_example(self, examples):
         """Check if a batch contains a learnable example.
@@ -360,25 +378,15 @@ def update(
         losses.setdefault(self.name, 0.0)
         if not examples:
             return losses
+        examples = self._augment_examples(examples)
         validate_examples(examples, "EntityLinker.update")
 
-        set_dropout_rate(self.model, drop)
-        docs = [eg.predicted for eg in examples]
-        # save to restore later
-        old_ents = [doc.ents for doc in docs]
-
-        for doc, ex in zip(docs, examples):
-            if self.use_gold_ents:
-                ents, _ = ex.get_aligned_ents_and_ner()
-                doc.ents = ents
-            else:
-                # only keep matching ents
-                doc.ents = ex.get_matching_ents()
-
         # make sure we have something to learn from, if not, short-circuit
         if not self.batch_has_learnable_example(examples):
             return losses
 
+        set_dropout_rate(self.model, drop)
+        docs = [eg.predicted for eg in examples]
         sentence_encodings, bp_context = self.model.begin_update(docs)
 
         loss, d_scores = self.get_loss(
@@ -389,14 +397,10 @@ def update(
             self.finish_update(sgd)
         losses[self.name] += loss
 
-        # now restore the ents
-        assert len(docs) == len(old_ents)
-        for doc, old in zip(docs, old_ents):
-            doc.ents = old
-
         return losses
 
     def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
+        """ Here, we assume that get_loss is called with augmented examples if need be"""
         validate_examples(examples, "EntityLinker.get_loss")
         entity_encodings = []
         eidx = 0  # indices in gold entities to keep
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 23072fdcaa2..cb3b63df40f 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -807,6 +807,103 @@ def create_kb(vocab):
     assert_equal(batch_deps_1, batch_deps_2)
     assert_equal(batch_deps_1, no_batch_deps)
 
+    eval = nlp.evaluate(train_examples)
+    assert "nel_macro_p" in eval
+    assert "nel_macro_r" in eval
+    assert "nel_macro_f" in eval
+    assert "nel_micro_p" in eval
+    assert "nel_micro_r" in eval
+    assert "nel_micro_f" in eval
+    assert "nel_f_per_type" in eval
+    assert "PERSON" in eval["nel_f_per_type"]
+
+    assert eval["nel_macro_f"] > 0
+    assert eval["nel_micro_f"] > 0
+
+
+def test_overfitting_IO_with_ner():
+    # Simple test to try and overfit the NER and NEL components in combination - ensuring the ML models work correctly
+    nlp = English()
+    vector_length = 3
+    assert "Q2146908" not in nlp.vocab.strings
+
+    # NOTE(review): train_examples is rebuilt from nlp.make_doc() further down, so this first list is never used
+    train_examples = []
+    for text, annotation in TRAIN_DATA:
+        doc = nlp(text)
+        train_examples.append(Example.from_dict(doc, annotation))
+
+    def create_kb(vocab):
+        # create artificial KB - assign same prior weight to the two russ cochran's
+        # Q2146908 (Russ Cochran): American golfer
+        # Q7381115 (Russ Cochran): publisher
+        mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
+        mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
+        mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
+        mykb.add_alias(
+            alias="Russ Cochran",
+            entities=["Q2146908", "Q7381115"],
+            probabilities=[0.5, 0.5],
+        )
+        return mykb
+
+    # Create the NER and EL components and add them to the pipeline
+    ner = nlp.add_pipe("ner", first=True)
+    entity_linker = nlp.add_pipe("entity_linker", last=True, config={"use_gold_ents": False})
+    entity_linker.set_kb(create_kb)
+
+    train_examples = []
+    for text, annotations in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+        for ent in annotations.get("entities"):
+            ner.add_label(ent[2])
+    optimizer = nlp.initialize()
+
+    # train the NER and NEL pipes
+    for i in range(50):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["ner"] < 0.001
+    assert losses["entity_linker"] < 0.001
+
+    # adding additional components that are required for the entity_linker
+    nlp.add_pipe("sentencizer", first=True)
+
+    # test the trained model
+    test_text = "Russ Cochran was a member of a golf team."
+    doc = nlp(test_text)
+    ents = doc.ents
+    assert len(ents) == 1
+    assert ents[0].text == "Russ Cochran"
+    assert ents[0].label_ == "PERSON"
+    assert ents[0].kb_id_ == "Q2146908"
+
+    # Also test the results are still the same after IO
+    with make_tempdir() as tmp_dir:
+        nlp.to_disk(tmp_dir)
+        nlp2 = util.load_model_from_path(tmp_dir)
+        assert nlp2.pipe_names == nlp.pipe_names
+        doc2 = nlp2(test_text)
+        ents2 = doc2.ents
+        assert len(ents2) == 1
+        assert ents2[0].text == "Russ Cochran"
+        assert ents2[0].label_ == "PERSON"
+        assert ents2[0].kb_id_ == "Q2146908"
+
+    eval = nlp.evaluate(train_examples)
+    print(eval)
+    assert "nel_macro_f" in eval
+    assert "nel_micro_f" in eval
+    assert "ents_f" in eval
+    assert "nel_f_per_type" in eval
+    assert "ents_per_type" in eval
+    assert "PERSON" in eval["nel_f_per_type"]
+    assert "PERSON" in eval["ents_per_type"]
+
+    assert eval["nel_macro_f"] > 0
+    assert eval["nel_micro_f"] > 0
+    assert eval["ents_f"] > 0
+
 
 def test_kb_serialization():
     # Test that the KB can be used in a pipeline with a different vocab