Store activations in Docs when save_activations is enabled (explosion#11002)

* Store activations in Doc when `store_activations` is enabled

This change adds the new `activations` attribute to `Doc`. This
attribute can be used by trainable pipes to store their activations,
probabilities, and guesses for downstream users.

As an example, this change modifies the `tagger` and `senter` pipes to
add a `store_activations` option. When this option is enabled, the
probabilities and guesses are stored in `set_annotations`.
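
A minimal sketch of what this enables for a downstream user, assuming a pipeline built against a spaCy version that includes this change; the option and key names here reflect the final state of this PR (`save_activations`, "probabilities", "label_ids"), which later commits below arrive at from the original `store_activations`, "probs" and "guesses":

```python
import spacy

# Any pipeline with a tagger will do; the pipeline name is only illustrative.
nlp = spacy.load("en_core_web_sm")

# Settable bool property on the trainable pipe (introduced later in this PR).
nlp.get_pipe("tagger").save_activations = True

doc = nlp("Activations are stored per component.")

# Each trainable pipe stores its activations under its own component name.
tagger_acts = doc.activations["tagger"]
print(tagger_acts["probabilities"].shape)  # per-token probability scores
print(tagger_acts["label_ids"])            # per-token label-id guesses
```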

* Change type of `store_activations` to `Union[bool, List[str]]`

When the value is:

- A bool: all activations are stored when set to `True`.
- A List[str]: only the activations named in the list are stored.
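
A sketch of how the two forms would be passed at this intermediate point in the PR; the exact option name and semantics are an assumption based on the description above, and later commits below simplify the option back to a plain bool and rename it to `save_activations`:

```python
import spacy

nlp = spacy.blank("en")

# Store all activations produced by the pipe:
nlp.add_pipe("tagger", config={"store_activations": True})

# Store only the activations named in the list:
nlp.add_pipe("senter", config={"store_activations": ["probs"]})
```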

* Formatting fixes in Tagger

* Support store_activations in spancat and morphologizer

* Make Doc.activations type visible to MyPy

* textcat/textcat_multilabel: add store_activations option

* trainable_lemmatizer/entity_linker: add store_activations option

* parser/ner: do not currently support returning activations

* Extend tagger and senter tests

So that they, like the other tests, also check that we get no
activations if no activations were requested.

* Document `Doc.activations` and `store_activations` in the relevant pipes

* Start errors/warnings at higher numbers to avoid merge conflicts

Between the master and v4 branches.

* Add `store_activations` to docstrings.

* Replace store_activations setter by set_store_activations method

Setters that accept a different type than the getter returns are still
problematic for MyPy. Replace the setter with a method so that type inference
works everywhere.
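
A simplified sketch of the typing problem and the workaround, with illustrative names and signatures only (not the actual spaCy code):

```python
from typing import List, Union


class Pipe:
    """Illustrative stand-in for a trainable pipe."""

    def __init__(self) -> None:
        self.activations = ["probs", "guesses"]  # names this pipe can expose
        self._store_activations: List[str] = []

    # MyPy types a property from its getter, so a setter that accepts
    # Union[bool, List[str]] while the getter returns List[str] produces
    # false positives when assigning a bool.
    @property
    def store_activations(self) -> List[str]:
        return self._store_activations

    # Replacing the setter with an explicit method keeps the read type and
    # the write type both visible to the type checker.
    def set_store_activations(self, value: Union[bool, List[str]]) -> None:
        if isinstance(value, bool):
            self._store_activations = list(self.activations) if value else []
        else:
            self._store_activations = value
```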

* Use dict comprehension suggested by @svlandeg

* Revert "Use dict comprehension suggested by @svlandeg"

This reverts commit 6e7b958.

* EntityLinker: add type annotations to _add_activations

* _store_activations: make kwarg-only, remove doc_scores_lens arg

* set_annotations: add type annotations

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* TextCat.predict: return dict

* Make the `TrainablePipe.store_activations` property a bool

This means that we can also bring back the `store_activations` setter.

* Remove `TrainablePipe.activations`

We no longer need to enumerate the individual activations, since
`store_activations` is now a plain `bool`.
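
With a plain bool the getter and setter share one type, so an ordinary property works again; a sketch with illustrative names only (the property is renamed to `save_activations` in a later commit below):

```python
class Pipe:
    def __init__(self) -> None:
        self._store_activations = False

    # Getter and setter now agree on bool, so MyPy accepts a normal property.
    @property
    def store_activations(self) -> bool:
        return self._store_activations

    @store_activations.setter
    def store_activations(self, value: bool) -> None:
        self._store_activations = value
```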

* Add type annotations for activations in predict/set_annotations

* Rename `TrainablePipe.store_activations` to `save_activations`

* Error E1400 is not used anymore

This error was only needed while `save_activations` was still `Union[bool, List[str]]`.

* Change wording in API docs after store -> save change

* docs: tag (save_)activations as new in spaCy 4.0

* Fix copied line in morphologizer activations test

* Don't train in any test_save_activations test

* Rename activations

- "probs" -> "probabilities"
- "guesses" -> "label_ids", except in the edit tree lemmatizer, where
  "guesses" -> "tree_ids".

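For downstream code this means the keys under `doc.activations[component_name]` now look like "probabilities" plus "label_ids" (tagger) or "tree_ids" (edit tree lemmatizer, registered as "trainable_lemmatizer"); a small sketch of inspecting them, with the helper name being purely illustrative:

```python
def describe_activations(doc) -> None:
    """Sketch: print the activation keys each component stored on the Doc."""
    for component_name, acts in doc.activations.items():
        # e.g. "tagger"                -> ["label_ids", "probabilities"]
        #      "trainable_lemmatizer"  -> ["probabilities", "tree_ids"]
        print(component_name, sorted(acts.keys()))
```
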
* Remove unused W400 warning.

This warning was used when we still allowed the user to specify
which activations to save.

* Formatting fixes

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Replace "kb_ids" by a constant

* spancat: replace a cast by an assertion

* Fix EOF spacing

* Fix comments in test_save_activations tests

* Do not set RNG seed in activation saving tests

* Revert "spancat: replace a cast by an assertion"

This reverts commit 0bd5730.

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2 people authored and jikanter committed May 10, 2024
1 parent 3abfa8d commit 1233907
Showing 19 changed files with 202 additions and 755 deletions.
47 changes: 6 additions & 41 deletions spacy/pipeline/edit_tree_lemmatizer.py
@@ -1,10 +1,10 @@
from collections import Counter
from itertools import islice
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, cast
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, cast

import numpy as np
import srsly
from thinc.api import Config, Model, NumpyOps, SequenceCategoricalCrossentropy
from thinc.api import Config, Model, SequenceCategoricalCrossentropy
from thinc.types import ArrayXd, Floats2d, Ints1d

from .. import util
@@ -18,6 +18,10 @@
from .lemmatizer import lemmatizer_score
from .trainable_pipe import TrainablePipe

# The cutoff value of *top_k* above which an alternative method is used to process guesses.
TOP_K_GUARDRAIL = 20


ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]


@@ -50,7 +54,6 @@
        "top_k": 1,
        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
        "save_activations": False,
        "save_activations": False,
    },
    default_score_weights={"lemma_acc": 1.0},
)
@@ -64,7 +67,6 @@ def make_edit_tree_lemmatizer(
    top_k: int,
    scorer: Optional[Callable],
    save_activations: bool,
    save_activations: bool,
):
    """Construct an EditTreeLemmatizer component."""
    return EditTreeLemmatizer(
@@ -77,7 +79,6 @@ def make_edit_tree_lemmatizer(
        top_k=top_k,
        scorer=scorer,
        save_activations=save_activations,
        save_activations=save_activations,
    )


@@ -98,7 +99,6 @@ def __init__(
        top_k: int = 1,
        scorer: Optional[Callable] = lemmatizer_score,
        save_activations: bool = False,
        save_activations: bool = False,
    ):
        """
        Construct an edit tree lemmatizer.
@@ -111,7 +111,6 @@ def __init__(
        overwrite (bool): overwrite existing lemma annotations.
        top_k (int): try to apply at most the k most probable edit trees.
        save_activations (bool): save model activations in Doc when annotating.
        save_activations (bool): save model activations in Doc when annotating.
        """
        self.vocab = vocab
        self.model = model
@@ -127,7 +126,6 @@ def __init__(
        self.cfg: Dict[str, Any] = {"labels": []}
        self.scorer = scorer
        self.save_activations = save_activations
        self.save_activations = save_activations

    def get_loss(
        self, examples: Iterable[Example], scores: List[Floats2d]
@@ -156,25 +154,6 @@ def get_loss(

        return float(loss), d_scores

    def get_teacher_student_loss(
        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
    ) -> Tuple[float, List[Floats2d]]:
        """Calculate the loss and its gradient for a batch of student
        scores, relative to teacher scores.
        teacher_scores: Scores representing the teacher model's predictions.
        student_scores: Scores representing the student model's predictions.
        RETURNS (Tuple[float, float]): The loss and the gradient.
        DOCS: https://spacy.io/api/edittreelemmatizer#get_teacher_student_loss
        """
        loss_func = SequenceCategoricalCrossentropy(normalize=False)
        d_scores, loss = loss_func(student_scores, teacher_scores)
        if self.model.ops.xp.isnan(loss):
            raise ValueError(Errors.E910.format(name=self.name))
        return float(loss), d_scores

    def predict(self, docs: Iterable[Doc]) -> ActivationsT:
        n_docs = len(list(docs))
        if not any(len(doc) for doc in docs):
@@ -186,21 +165,13 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
            scores: List[Floats2d] = [
                self.model.ops.alloc((0, n_labels), dtype="i") for doc in docs
            ]
            guesses: List[Ints1d] = [
                self.model.ops.alloc((0,), dtype="i") for doc in docs
            ]
            scores: List[Floats2d] = [
                self.model.ops.alloc((0, n_labels), dtype="i") for doc in docs
            ]
            assert len(guesses) == n_docs
            return {"probabilities": scores, "tree_ids": guesses}
            return {"probabilities": scores, "tree_ids": guesses}
        scores = self.model.predict(docs)
        assert len(scores) == n_docs
        guesses = scores2guesses(docs, scores)
        assert len(guesses) == n_docs
        return {"probabilities": scores, "tree_ids": guesses}
        return {"probabilities": scores, "tree_ids": guesses}

    def _scores2guesses_top_k_equals_1(self, docs, scores):
        guesses = []
@@ -260,15 +231,9 @@ def _scores2guesses_top_k_guardrail(self, docs, scores):

        return guesses

    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
        batch_tree_ids = activations["tree_ids"]
    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
        batch_tree_ids = activations["tree_ids"]
        for i, doc in enumerate(docs):
            if self.save_activations:
                doc.activations[self.name] = {}
                for act_name, acts in activations.items():
                    doc.activations[self.name][act_name] = acts[i]
            if self.save_activations:
                doc.activations[self.name] = {}
                for act_name, acts in activations.items():