From b6dad83b537e3bbbe356611d6d3033332f7f48fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Tue, 13 Sep 2022 09:51:12 +0200 Subject: [PATCH] Store activations in `Doc`s when `save_activations` is enabled (#11002) * Store activations in Doc when `store_activations` is enabled This change adds the new `activations` attribute to `Doc`. This attribute can be used by trainable pipes to store their activations, probabilities, and guesses for downstream users. As an example, this change modifies the `tagger` and `senter` pipes to add a `store_activations` option. When this option is enabled, the probabilities and guesses are stored in `set_annotations`. * Change type of `store_activations` to `Union[bool, List[str]]` When the value is: - A bool: all activations are stored when set to `True`. - A List[str]: the activations named in the list are stored * Formatting fixes in Tagger * Support store_activations in spancat and morphologizer * Make Doc.activations type visible to MyPy * textcat/textcat_multilabel: add store_activations option * trainable_lemmatizer/entity_linker: add store_activations option * parser/ner: do not currently support returning activations * Extend tagger and senter tests So that they, like the other tests, also check that we get no activations if no activations were requested. * Document `Doc.activations` and `store_activations` in the relevant pipes * Start errors/warnings at higher numbers to avoid merge conflicts Between the master and v4 branches. * Add `store_activations` to docstrings. * Replace store_activations setter by set_store_activations method Setters that take a different type than what the getter returns are still problematic for MyPy. Replace the setter by a method, so that type inference works everywhere. * Use dict comprehension suggested by @svlandeg * Revert "Use dict comprehension suggested by @svlandeg" This reverts commit 6e7b958f7060397965176c69649e5414f1f24988. 
* EntityLinker: add type annotations to _add_activations * _store_activations: make kwarg-only, remove doc_scores_lens arg * set_annotations: add type annotations * Apply suggestions from code review Co-authored-by: Sofie Van Landeghem * TextCat.predict: return dict * Make the `TrainablePipe.store_activations` property a bool This means that we can also bring back `store_activations` setter. * Remove `TrainablePipe.activations` We do not need to enumerate the activations anymore since `store_activations` is `bool`. * Add type annotations for activations in predict/set_annotations * Rename `TrainablePipe.store_activations` to `save_activations` * Error E1400 is not used anymore This error was used when activations were still `Union[bool, List[str]]`. * Change wording in API docs after store -> save change * docs: tag (save_)activations as new in spaCy 4.0 * Fix copied line in morphologizer activations test * Don't train in any test_save_activations test * Rename activations - "probs" -> "probabilities" - "guesses" -> "label_ids", except in the edit tree lemmatizer, where "guesses" -> "tree_ids". * Remove unused W400 warning. This warning was used when we still allowed the user to specify which activations to save. * Formatting fixes Co-authored-by: Sofie Van Landeghem * Replace "kb_ids" by a constant * spancat: replace a cast by an assertion * Fix EOF spacing * Fix comments in test_save_activations tests * Do not set RNG seed in activation saving tests * Revert "spancat: replace a cast by an assertion" This reverts commit 0bd5730d16432443a2b247316928d4f789ad8741. 
Co-authored-by: Sofie Van Landeghem --- spacy/pipeline/edit_tree_lemmatizer.py | 45 ++--- spacy/pipeline/entity_linker.py | 167 +++++++++++------- spacy/pipeline/morphologizer.pyx | 30 ++++ spacy/pipeline/senter.pyx | 31 +++- spacy/pipeline/spancat.py | 37 ++-- spacy/pipeline/tagger.pyx | 34 +++- spacy/pipeline/textcat.py | 4 + spacy/pipeline/textcat_multilabel.py | 18 +- spacy/pipeline/trainable_pipe.pyx | 6 +- .../pipeline/test_edit_tree_lemmatizer.py | 25 +++ spacy/tests/pipeline/test_entity_linker.py | 8 +- spacy/tests/pipeline/test_morphologizer.py | 25 ++- spacy/tests/pipeline/test_senter.py | 25 ++- spacy/tests/pipeline/test_tagger.py | 49 +---- spacy/tests/pipeline/test_textcat.py | 48 +++-- spacy/tokens/doc.pxd | 2 + spacy/tokens/doc.pyi | 3 +- website/docs/api/doc.mdx | 1 + website/docs/api/entitylinker.mdx | 2 +- website/docs/api/morphologizer.mdx | 6 +- 20 files changed, 388 insertions(+), 178 deletions(-) diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py index 1a29735e8e8..1f16b44cfed 100644 --- a/spacy/pipeline/edit_tree_lemmatizer.py +++ b/spacy/pipeline/edit_tree_lemmatizer.py @@ -4,7 +4,7 @@ import numpy as np import srsly -from thinc.api import Config, Model, NumpyOps, SequenceCategoricalCrossentropy +from thinc.api import Config, Model, SequenceCategoricalCrossentropy from thinc.types import ArrayXd, Floats2d, Ints1d from .. import util @@ -18,6 +18,10 @@ from .lemmatizer import lemmatizer_score from .trainable_pipe import TrainablePipe +# The cutoff value of *top_k* above which an alternative method is used to process guesses. 
+TOP_K_GUARDRAIL = 20 + + ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]] @@ -50,6 +54,7 @@ "top_k": 1, "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, "save_activations": False, + "save_activations": False, }, default_score_weights={"lemma_acc": 1.0}, ) @@ -63,6 +68,7 @@ def make_edit_tree_lemmatizer( top_k: int, scorer: Optional[Callable], save_activations: bool, + save_activations: bool, ): """Construct an EditTreeLemmatizer component.""" return EditTreeLemmatizer( @@ -75,6 +81,7 @@ def make_edit_tree_lemmatizer( top_k=top_k, scorer=scorer, save_activations=save_activations, + save_activations=save_activations, ) @@ -95,6 +102,7 @@ def __init__( top_k: int = 1, scorer: Optional[Callable] = lemmatizer_score, save_activations: bool = False, + save_activations: bool = False, ): """ Construct an edit tree lemmatizer. @@ -107,6 +115,7 @@ def __init__( overwrite (bool): overwrite existing lemma annotations. top_k (int): try to apply at most the k most probable edit trees. save_activations (bool): save model activations in Doc when annotating. + save_activations (bool): save model activations in Doc when annotating. """ self.vocab = vocab self.model = model @@ -122,6 +131,7 @@ def __init__( self.cfg: Dict[str, Any] = {"labels": []} self.scorer = scorer self.save_activations = save_activations + self.save_activations = save_activations def get_loss( self, examples: Iterable[Example], scores: List[Floats2d] @@ -150,25 +160,6 @@ def get_loss( return float(loss), d_scores - def get_teacher_student_loss( - self, teacher_scores: List[Floats2d], student_scores: List[Floats2d] - ) -> Tuple[float, List[Floats2d]]: - """Calculate the loss and its gradient for a batch of student - scores, relative to teacher scores. - - teacher_scores: Scores representing the teacher model's predictions. - student_scores: Scores representing the student model's predictions. - - RETURNS (Tuple[float, float]): The loss and the gradient. 
- - DOCS: https://spacy.io/api/edittreelemmatizer#get_teacher_student_loss - """ - loss_func = SequenceCategoricalCrossentropy(normalize=False) - d_scores, loss = loss_func(student_scores, teacher_scores) - if self.model.ops.xp.isnan(loss): - raise ValueError(Errors.E910.format(name=self.name)) - return float(loss), d_scores - def predict(self, docs: Iterable[Doc]) -> ActivationsT: n_docs = len(list(docs)) if not any(len(doc) for doc in docs): @@ -180,13 +171,21 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT: scores: List[Floats2d] = [ self.model.ops.alloc((0, n_labels), dtype="i") for doc in docs ] + guesses: List[Ints1d] = [ + self.model.ops.alloc((0,), dtype="i") for doc in docs + ] + scores: List[Floats2d] = [ + self.model.ops.alloc((0, n_labels), dtype="i") for doc in docs + ] assert len(guesses) == n_docs return {"probabilities": scores, "tree_ids": guesses} + return {"probabilities": scores, "tree_ids": guesses} scores = self.model.predict(docs) assert len(scores) == n_docs guesses = scores2guesses(docs, scores) assert len(guesses) == n_docs return {"probabilities": scores, "tree_ids": guesses} + return {"probabilities": scores, "tree_ids": guesses} def _scores2guesses_top_k_equals_1(self, docs, scores): guesses = [] @@ -246,9 +245,15 @@ def _scores2guesses_top_k_guardrail(self, docs, scores): return guesses + def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT): + batch_tree_ids = activations["tree_ids"] def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT): batch_tree_ids = activations["tree_ids"] for i, doc in enumerate(docs): + if self.save_activations: + doc.activations[self.name] = {} + for act_name, acts in activations.items(): + doc.activations[self.name][act_name] = acts[i] if self.save_activations: doc.activations[self.name] = {} for act_name, acts in activations.items(): diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 629a5f193aa..2716d3821e2 100644 --- 
a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -1,3 +1,10 @@ +from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any +from typing import cast +from numpy import dtype +from thinc.types import Floats1d, Floats2d, Ints1d, Ragged +from pathlib import Path +from itertools import islice +import srsly import random import warnings from itertools import islice @@ -21,10 +28,14 @@ from .pipe import deserialize_config from .trainable_pipe import TrainablePipe + ActivationsT = Dict[str, Union[List[Ragged], List[str]]] KNOWLEDGE_BASE_IDS = "kb_ids" +# See #9050 +BACKWARD_OVERWRITE = True + default_model_config = """ [model] @architectures = "spacy.EntityLinker.v2" @@ -61,6 +72,7 @@ "candidates_batch_size": 1, "threshold": None, "save_activations": False, + "save_activations": False, }, default_score_weights={ "nel_micro_f": 1.0, @@ -89,6 +101,7 @@ def make_entity_linker( candidates_batch_size: int, threshold: Optional[float] = None, save_activations: bool, + save_activations: bool, ): """Construct an EntityLinker component. @@ -113,6 +126,7 @@ def make_entity_linker( threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold, prediction is discarded. If None, predictions are not filtered by any threshold. save_activations (bool): save model activations in Doc when annotating. + save_activations (bool): save model activations in Doc when annotating. """ if not model.attrs.get("include_span_maker", False): raise ValueError(Errors.E4005) @@ -135,6 +149,7 @@ def make_entity_linker( candidates_batch_size=candidates_batch_size, threshold=threshold, save_activations=save_activations, + save_activations=save_activations, ) @@ -176,6 +191,7 @@ def __init__( candidates_batch_size: int, threshold: Optional[float] = None, save_activations: bool = False, + save_activations: bool = False, ) -> None: """Initialize an entity linker. 
@@ -230,6 +246,7 @@ def __init__( self.candidates_batch_size = candidates_batch_size self.threshold = threshold self.save_activations = save_activations + self.save_activations = save_activations if candidates_batch_size < 1: raise ValueError(Errors.E1044) @@ -437,6 +454,7 @@ def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d): loss = loss / len(entity_encodings) return float(loss), out + def predict(self, docs: Iterable[Doc]) -> ActivationsT: def predict(self, docs: Iterable[Doc]) -> ActivationsT: """Apply the pipeline's model to a batch of docs, without modifying them. Returns the KB IDs for each entity in each doc, including NIL if there is @@ -454,38 +472,48 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT: xp = ops.xp docs_ents: List[Ragged] = [] docs_scores: List[Ragged] = [] + ops = self.model.ops + xp = ops.xp + docs_ents: List[Ragged] = [] + docs_scores: List[Ragged] = [] if not docs: - return { - KNOWLEDGE_BASE_IDS: final_kb_ids, - "ents": docs_ents, - "scores": docs_scores, - } + return {KNOWLEDGE_BASE_IDS: final_kb_ids, "ents": docs_ents, "scores": docs_scores} if isinstance(docs, Doc): docs = [docs] + for doc in docs: + doc_ents: List[Ints1d] = [] + doc_scores: List[Floats1d] = [] for doc in docs: doc_ents: List[Ints1d] = [] doc_scores: List[Floats1d] = [] if len(doc) == 0: + docs_scores.append(Ragged(ops.alloc1f(0), ops.alloc1i(0))) + docs_ents.append(Ragged(xp.zeros(0, dtype="uint64"), ops.alloc1i(0))) docs_scores.append(Ragged(ops.alloc1f(0), ops.alloc1i(0))) docs_ents.append(Ragged(xp.zeros(0, dtype="uint64"), ops.alloc1i(0))) continue sentences = [s for s in doc.sents] - # Loop over entities in batches. - for ent_idx in range(0, len(doc.ents), self.candidates_batch_size): - ent_batch = doc.ents[ent_idx : ent_idx + self.candidates_batch_size] - - # Look up candidate entities. 
- valid_ent_idx = [ - idx - for idx in range(len(ent_batch)) - if ent_batch[idx].label_ not in self.labels_discard - ] - - batch_candidates = list( - self.get_candidates_batch( - self.kb, - SpanGroup(doc, spans=[ent_batch[idx] for idx in valid_ent_idx]), + if self.incl_context: + # get n_neighbour sentences, clipped to the length of the document + start_sentence = max(0, sent_index - self.n_sents) + end_sentence = min(len(sentences) - 1, sent_index + self.n_sents) + start_token = sentences[start_sentence].start + end_token = sentences[end_sentence].end + sent_doc = doc[start_token:end_token].as_doc() + # currently, the context is the same for each entity in a sentence (should be refined) + sentence_encoding = self.model.predict([sent_doc])[0] + sentence_encoding_t = sentence_encoding.T + sentence_norm = xp.linalg.norm(sentence_encoding_t) + entity_count += 1 + if ent.label_ in self.labels_discard: + # ignoring this entity - setting to NIL + final_kb_ids.append(self.NIL) + self._add_activations( + doc_scores=doc_scores, + doc_ents=doc_ents, + scores=[0.0], + ents=[0], ) else: candidates = list(self.get_candidates(self.kb, ent)) @@ -519,51 +547,17 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT: entity_encodings = xp.asarray( [c.entity_vector for c in candidates] ) - elif len(candidates) == 1 and self.threshold is None: - # shortcut for efficiency reasons: take the 1 candidate - final_kb_ids.append(candidates[0].entity_id_) - self._add_activations( - doc_scores=doc_scores, - doc_ents=doc_ents, - scores=[1.0], - ents=[candidates[0].entity_id], - ) - else: - random.shuffle(candidates) - # set all prior probabilities to 0 if incl_prior=False - if self.incl_prior and self.kb.supports_prior_probs: - prior_probs = xp.asarray([c.prior_prob for c in candidates]) # type: ignore - else: - prior_probs = xp.asarray([0.0 for _ in candidates]) - scores = prior_probs - # add in similarity from the context - if self.incl_context: - entity_encodings = xp.asarray( - 
[c.entity_vector for c in candidates] - ) - entity_norm = xp.linalg.norm(entity_encodings, axis=1) - if len(entity_encodings) != len(prior_probs): - raise RuntimeError( - Errors.E147.format( - method="predict", - msg="vectors not of equal length", - ) + entity_norm = xp.linalg.norm(entity_encodings, axis=1) + if len(entity_encodings) != len(prior_probs): + raise RuntimeError( + Errors.E147.format( + method="predict", + msg="vectors not of equal length", ) ) - if sims.shape != prior_probs.shape: - raise ValueError(Errors.E161) - scores = prior_probs + sims - (prior_probs * sims) - final_kb_ids.append( - candidates[scores.argmax().item()].entity_id_ - if self.threshold is None - or scores.max() >= self.threshold - else EntityLinker.NIL - ) - self._add_activations( - doc_scores=doc_scores, - doc_ents=doc_ents, - scores=scores, - ents=[c.entity_id for c in candidates], + # cosine similarity + sims = xp.dot(entity_encodings, sentence_encoding_t) / ( + sentence_norm * entity_norm ) if sims.shape != prior_probs.shape: raise ValueError(Errors.E161) @@ -590,27 +584,35 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT: method="predict", msg="result variables not of equal length" ) raise RuntimeError(err) - return { - KNOWLEDGE_BASE_IDS: final_kb_ids, - "ents": docs_ents, - "scores": docs_scores, - } + return {KNOWLEDGE_BASE_IDS: final_kb_ids, "ents": docs_ents, "scores": docs_scores} + def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None: def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None: """Modify a batch of documents, using pre-computed scores. docs (Iterable[Doc]): The documents to modify. activations (ActivationsT): The activations used for setting annotations, produced by EntityLinker.predict. + activations (ActivationsT): The activations used for setting annotations, produced + by EntityLinker.predict. 
DOCS: https://spacy.io/api/entitylinker#set_annotations """ kb_ids = cast(List[str], activations[KNOWLEDGE_BASE_IDS]) + kb_ids = cast(List[str], activations[KNOWLEDGE_BASE_IDS]) count_ents = len([ent for doc in docs for ent in doc.ents]) if count_ents != len(kb_ids): raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids))) i = 0 overwrite = self.cfg["overwrite"] + for j, doc in enumerate(docs): + if self.save_activations: + doc.activations[self.name] = {} + for act_name, acts in activations.items(): + if act_name != KNOWLEDGE_BASE_IDS: + # We only copy activations that are Ragged. + doc.activations[self.name][act_name] = cast(Ragged, acts[j]) + for j, doc in enumerate(docs): if self.save_activations: doc.activations[self.name] = {} @@ -746,3 +748,32 @@ def _add_activations( ops = self.model.ops doc_scores.append(ops.asarray1f(scores)) doc_ents.append(ops.asarray1i(ents, dtype="uint64")) + + def _add_doc_activations( + self, + *, + docs_scores: List[Ragged], + docs_ents: List[Ragged], + doc_scores: List[Floats1d], + doc_ents: List[Ints1d], + ): + if not self.save_activations: + return + ops = self.model.ops + lengths = ops.asarray1i([s.shape[0] for s in doc_scores]) + docs_scores.append(Ragged(ops.flatten(doc_scores), lengths)) + docs_ents.append(Ragged(ops.flatten(doc_ents), lengths)) + + def _add_activations( + self, + *, + doc_scores: List[Floats1d], + doc_ents: List[Ints1d], + scores: Sequence[float], + ents: Sequence[int], + ): + if not self.save_activations: + return + ops = self.model.ops + doc_scores.append(ops.asarray1f(scores)) + doc_ents.append(ops.asarray1i(ents, dtype="uint64")) diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 43e36b36844..c26be7912a9 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -1,4 +1,8 @@ # cython: infer_types=True, profile=True, binding=True +from typing import Callable, Dict, Iterable, List, Optional, Union +import srsly +from thinc.api 
import SequenceCategoricalCrossentropy, Model, Config +from thinc.types import Floats2d, Ints1d from itertools import islice from typing import Callable, Dict, Iterable, Optional, Union @@ -8,6 +12,12 @@ from ..morphology cimport Morphology from ..tokens.doc cimport Doc from ..vocab cimport Vocab +from ..parts_of_speech import IDS as POS_IDS +from ..symbols import POS +from ..language import Language +from ..errors import Errors +from .pipe import deserialize_config +from .tagger import ActivationsT, Tagger from .. import util from ..errors import Errors from ..language import Language @@ -58,6 +68,13 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"] "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}, "save_activations": False, }, + default_config={ + "model": DEFAULT_MORPH_MODEL, + "overwrite": True, + "extend": False, + "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}, + "save_activations": False, + }, default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None}, ) def make_morphologizer( @@ -69,9 +86,12 @@ def make_morphologizer( label_smoothing: float, scorer: Optional[Callable], save_activations: bool, + save_activations: bool, ): return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer, save_activations=save_activations) + return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer, + save_activations=save_activations) def morphologizer_score(examples, **kwargs): @@ -107,6 +127,7 @@ class Morphologizer(Tagger): extend: bool = False, scorer: Optional[Callable] = morphologizer_score, save_activations: bool = False, + save_activations: bool = False, ): """Initialize a morphologizer. @@ -120,6 +141,7 @@ class Morphologizer(Tagger): Scorer.score_token_attr for the attributes "pos" and "morph" and Scorer.score_token_attr_per_feat for the attribute "morph". save_activations (bool): save model activations in Doc when annotating. 
+ save_activations (bool): save model activations in Doc when annotating. DOCS: https://spacy.io/api/morphologizer#init """ @@ -141,6 +163,7 @@ class Morphologizer(Tagger): self.cfg = dict(sorted(cfg.items())) self.scorer = scorer self.save_activations = save_activations + self.save_activations = save_activations @property def labels(self): @@ -234,15 +257,18 @@ class Morphologizer(Tagger): assert len(label_sample) > 0, Errors.E923.format(name=self.name) self.model.initialize(X=doc_sample, Y=label_sample) + def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT): def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT): """Modify a batch of documents, using pre-computed scores. docs (Iterable[Doc]): The documents to modify. activations (ActivationsT): The activations used for setting annotations, produced by Morphologizer.predict. + activations (ActivationsT): The activations used for setting annotations, produced by Morphologizer.predict. DOCS: https://spacy.io/api/morphologizer#set_annotations """ batch_tag_ids = activations["label_ids"] + batch_tag_ids = activations["label_ids"] if isinstance(docs, Doc): docs = [docs] cdef Doc doc @@ -253,6 +279,10 @@ class Morphologizer(Tagger): # to allocate a compatible container out of the iterable. 
labels = tuple(self.labels) for i, doc in enumerate(docs): + if self.save_activations: + doc.activations[self.name] = {} + for act_name, acts in activations.items(): + doc.activations[self.name][act_name] = acts[i] if self.save_activations: doc.activations[self.name] = {} for act_name, acts in activations.items(): diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 51670dcf8cf..dd56b4a62e6 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -1,12 +1,16 @@ # cython: infer_types=True, profile=True, binding=True +from typing import Dict, Iterable, Optional, Callable, List, Union from itertools import islice from typing import Callable, Dict, Iterable, List, Optional, Union -from thinc.api import Config, Model, SequenceCategoricalCrossentropy +import srsly +from thinc.api import Model, SequenceCategoricalCrossentropy, Config +from thinc.types import Floats2d, Ints1d from ..tokens.doc cimport Doc -from .. import util +from .tagger import ActivationsT, Tagger +from ..language import Language from ..errors import Errors from ..language import Language from ..scorer import Scorer @@ -40,6 +44,12 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"] "scorer": {"@scorers": "spacy.senter_scorer.v1"}, "save_activations": False, }, + default_config={ + "model": DEFAULT_SENTER_MODEL, + "overwrite": False, + "scorer": {"@scorers": "spacy.senter_scorer.v1"}, + "save_activations": False, + }, default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, ) def make_senter(nlp: Language, @@ -49,6 +59,13 @@ def make_senter(nlp: Language, scorer: Optional[Callable], save_activations: bool): return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, save_activations=save_activations) +def make_senter(nlp: Language, + name: str, + model: Model, + overwrite: bool, + scorer: Optional[Callable], + save_activations: bool): + return SentenceRecognizer(nlp.vocab, model, name, 
overwrite=overwrite, scorer=scorer, save_activations=save_activations) def senter_score(examples, **kwargs): @@ -79,6 +96,7 @@ class SentenceRecognizer(Tagger): overwrite=False, scorer=senter_score, save_activations: bool = False, + save_activations: bool = False, ): """Initialize a sentence recognizer. @@ -90,6 +108,7 @@ class SentenceRecognizer(Tagger): scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_spans for the attribute "sents". save_activations (bool): save model activations in Doc when annotating. + save_activations (bool): save model activations in Doc when annotating. DOCS: https://spacy.io/api/sentencerecognizer#init """ @@ -100,6 +119,7 @@ class SentenceRecognizer(Tagger): self.cfg = {"overwrite": overwrite} self.scorer = scorer self.save_activations = save_activations + self.save_activations = save_activations @property def labels(self): @@ -117,20 +137,27 @@ class SentenceRecognizer(Tagger): def label_data(self): return None + def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT): def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT): """Modify a batch of documents, using pre-computed scores. docs (Iterable[Doc]): The documents to modify. activations (ActivationsT): The activations used for setting annotations, produced by SentenceRecognizer.predict. + activations (ActivationsT): The activations used for setting annotations, produced by SentenceRecognizer.predict. 
DOCS: https://spacy.io/api/sentencerecognizer#set_annotations """ batch_tag_ids = activations["label_ids"] + batch_tag_ids = activations["label_ids"] if isinstance(docs, Doc): docs = [docs] cdef Doc doc cdef bint overwrite = self.cfg["overwrite"] for i, doc in enumerate(docs): + if self.save_activations: + doc.activations[self.name] = {} + for act_name, acts in activations.items(): + doc.activations[self.name][act_name] = acts[i] if self.save_activations: doc.activations[self.name] = {} for act_name, acts in activations.items(): diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index 72fd78f461e..d800a4d484b 100644 --- a/spacy/pipeline/spancat.py +++ b/spacy/pipeline/spancat.py @@ -1,18 +1,8 @@ -from dataclasses import dataclass -from functools import partial -from typing import ( - Any, - Callable, - Dict, - Iterable, - List, - Optional, - Protocol, - Tuple, - Union, - cast, - runtime_checkable, -) +from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast +from typing import Union +from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops +from thinc.api import Optimizer +from thinc.types import Ragged, Ints2d, Floats2d, Ints1d import numpy from thinc.api import Config, Model, Ops, Optimizer, get_current_ops, set_dropout_rate @@ -36,6 +26,9 @@ ActivationsT = Dict[str, Union[Floats2d, Ragged]] +ActivationsT = Dict[str, Union[Floats2d, Ragged]] + + spancat_default_config = """ [model] @architectures = "spacy.SpanCategorizer.v1" @@ -191,6 +184,7 @@ def build_preset_spans_suggester(spans_key: str) -> Suggester: "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]}, "scorer": {"@scorers": "spacy.spancat_scorer.v1"}, "save_activations": False, + "save_activations": False, }, default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0}, ) @@ -204,6 +198,7 @@ def make_spancat( threshold: float, max_positive: Optional[int], save_activations: bool, + save_activations: bool, ) -> 
"SpanCategorizer": """Create a SpanCategorizer component and configure it for multi-label classification to be able to assign multiple labels for each span. @@ -232,6 +227,7 @@ def make_spancat( max_positive (Optional[int]): Maximum number of labels to consider positive per span. Defaults to None, indicating no limit. save_activations (bool): save model activations in Doc when annotating. + save_activations (bool): save model activations in Doc when annotating. """ return SpanCategorizer( nlp.vocab, @@ -311,6 +307,7 @@ def make_spancat_singlelabel( threshold=None, scorer=scorer, save_activations=save_activations, + save_activations=save_activations, ) @@ -374,6 +371,7 @@ def __init__( threshold: Optional[float] = 0.5, scorer: Optional[Callable] = spancat_score, save_activations: bool = False, + save_activations: bool = False, ) -> None: """Initialize the multi-label or multi-class span categorizer. @@ -424,6 +422,7 @@ def __init__( self.name = name self.scorer = scorer self.save_activations = save_activations + self.save_activations = save_activations @property def key(self) -> str: @@ -481,6 +480,7 @@ def label_data(self) -> List[str]: """ return list(self.labels) + def predict(self, docs: Iterable[Doc]) -> ActivationsT: def predict(self, docs: Iterable[Doc]) -> ActivationsT: """Apply the pipeline's model to a batch of docs, without modifying them. 
@@ -492,6 +492,8 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT: indices = self.suggester(docs, ops=self.model.ops) scores = self.model.predict((docs, indices)) # type: ignore return {"indices": indices, "scores": scores} + scores = self.model.predict((docs, indices)) # type: ignore + return {"indices": indices, "scores": scores} def set_candidates( self, docs: Iterable[Doc], *, candidates_key: str = "candidates" @@ -511,11 +513,13 @@ def set_candidates( for index in candidates.dataXd: doc.spans[candidates_key].append(doc[index[0] : index[1]]) + def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None: def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None: """Modify a batch of Doc objects, using pre-computed scores. docs (Iterable[Doc]): The documents to modify. activations: ActivationsT: The activations, produced by SpanCategorizer.predict. + activations: ActivationsT: The activations, produced by SpanCategorizer.predict. DOCS: https://spacy.io/api/spancategorizer#set_annotations """ @@ -524,9 +528,10 @@ def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> Non indices = activations["indices"] assert isinstance(indices, Ragged) scores = cast(Floats2d, activations["scores"]) + offset = 0 for i, doc in enumerate(docs): - indices_i = cast(Ints2d, indices[i].dataXd) + indices_i = indices[i].dataXd if self.save_activations: doc.activations[self.name] = {} doc.activations[self.name]["indices"] = indices_i diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 21c7b3ab0a3..95016072ef3 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -1,4 +1,9 @@ # cython: infer_types=True, profile=True, binding=True +from typing import Callable, Dict, Iterable, List, Optional, Union +import numpy +import srsly +from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config +from thinc.types import Floats2d, Ints1d import warnings from 
itertools import islice from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union @@ -22,8 +27,12 @@ from ..util import registry from .pipe import deserialize_config from .trainable_pipe import TrainablePipe + ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]] +# See #9050 +BACKWARD_OVERWRITE = False + default_model_config = """ [model] @architectures = "spacy.Tagger.v2" @@ -51,6 +60,13 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"] "neg_prefix": "!", "save_activations": False, }, + default_config={ + "model": DEFAULT_TAGGER_MODEL, + "overwrite": False, + "scorer": {"@scorers": "spacy.tagger_scorer.v1"}, + "neg_prefix": "!", + "save_activations": False, + }, default_score_weights={"tag_acc": 1.0}, ) def make_tagger( @@ -61,6 +77,7 @@ def make_tagger( scorer: Optional[Callable], neg_prefix: str, save_activations: bool, + save_activations: bool, ): """Construct a part-of-speech tagger component. @@ -71,6 +88,8 @@ def make_tagger( """ return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix, save_activations=save_activations) + return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix, + save_activations=save_activations) def tagger_score(examples, **kwargs): @@ -97,6 +116,7 @@ class Tagger(TrainablePipe): scorer=tagger_score, neg_prefix="!", save_activations: bool = False, + save_activations: bool = False, ): """Initialize a part-of-speech tagger. @@ -108,6 +128,7 @@ class Tagger(TrainablePipe): scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_token_attr for the attribute "tag". save_activations (bool): save model activations in Doc when annotating. + save_activations (bool): save model activations in Doc when annotating. 
DOCS: https://spacy.io/api/tagger#init """ @@ -119,6 +140,7 @@ class Tagger(TrainablePipe): self.cfg = dict(sorted(cfg.items())) self.scorer = scorer self.save_activations = save_activations + self.save_activations = save_activations @property def labels(self): @@ -137,6 +159,7 @@ class Tagger(TrainablePipe): """Data about the labels currently added to the component.""" return tuple(self.cfg["labels"]) + def predict(self, docs) -> ActivationsT: def predict(self, docs) -> ActivationsT: """Apply the pipeline's model to a batch of docs, without modifying them. @@ -151,11 +174,13 @@ class Tagger(TrainablePipe): guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs] assert len(guesses) == len(docs) return {"probabilities": guesses, "label_ids": guesses} + return {"probabilities": guesses, "label_ids": guesses} scores = self.model.predict(docs) assert len(scores) == len(docs), (len(scores), len(docs)) guesses = self._scores2guesses(scores) assert len(guesses) == len(docs) return {"probabilities": scores, "label_ids": guesses} + return {"probabilities": scores, "label_ids": guesses} def _scores2guesses(self, scores): guesses = [] @@ -166,21 +191,28 @@ class Tagger(TrainablePipe): guesses.append(doc_guesses) return guesses + def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT): def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT): """Modify a batch of documents, using pre-computed scores. docs (Iterable[Doc]): The documents to modify. activations (ActivationsT): The activations used for setting annotations, produced by Tagger.predict. + activations (ActivationsT): The activations used for setting annotations, produced by Tagger.predict. 
DOCS: https://spacy.io/api/tagger#set_annotations """ batch_tag_ids = activations["label_ids"] + batch_tag_ids = activations["label_ids"] if isinstance(docs, Doc): docs = [docs] cdef Doc doc cdef bint overwrite = self.cfg["overwrite"] labels = self.labels for i, doc in enumerate(docs): + if self.save_activations: + doc.activations[self.name] = {} + for act_name, acts in activations.items(): + doc.activations[self.name][act_name] = acts[i] if self.save_activations: doc.activations[self.name] = {} for act_name, acts in activations.items(): @@ -270,7 +302,7 @@ class Tagger(TrainablePipe): student_scores: Scores representing the student model's predictions. RETURNS (Tuple[float, float]): The loss and the gradient. - + DOCS: https://spacy.io/api/tagger#get_teacher_student_loss """ loss_func = SequenceCategoricalCrossentropy(normalize=False) diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 13841dd7bbb..79a98b9bc5f 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -1,3 +1,7 @@ +from typing import Iterable, Tuple, Optional, Dict, List, Callable, Any, Union +from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config +from thinc.types import Floats2d +import numpy from itertools import islice from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py index 309b9a84443..d38beb441da 100644 --- a/spacy/pipeline/textcat_multilabel.py +++ b/spacy/pipeline/textcat_multilabel.py @@ -1,3 +1,7 @@ +from typing import Iterable, Optional, Dict, List, Callable, Any, Union +from thinc.types import Floats2d +from thinc.api import Model, Config + from itertools import islice from typing import Any, Callable, Dict, Iterable, List, Optional, Union @@ -80,6 +84,8 @@ "model": DEFAULT_MULTI_TEXTCAT_MODEL, "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"}, "save_activations": False, + "scorer": {"@scorers": 
"spacy.textcat_multilabel_scorer.v1"}, + "save_activations": False, }, default_score_weights={ "cats_score": 1.0, @@ -101,6 +107,9 @@ def make_multilabel_textcat( threshold: float, scorer: Optional[Callable], save_activations: bool, +) -> "TextCategorizer": + """Create a TextCategorizer component. The text categorizer predicts categories + save_activations: bool, ) -> "TextCategorizer": """Create a TextCategorizer component. The text categorizer predicts categories over a whole document. It can learn one or more labels, and the labels are considered @@ -119,6 +128,12 @@ def make_multilabel_textcat( threshold=threshold, scorer=scorer, save_activations=save_activations, + nlp.vocab, + model, + name, + threshold=threshold, + scorer=scorer, + save_activations=save_activations, ) @@ -151,6 +166,7 @@ def __init__( threshold: float, scorer: Optional[Callable] = textcat_multilabel_score, save_activations: bool = False, + save_activations: bool = False, ) -> None: """Initialize a text categorizer for multi-label classification. @@ -159,7 +175,6 @@ def __init__( name (str): The component instance name, used to add entries to the losses during training. threshold (float): Cutoff to consider a prediction "positive". - scorer (Optional[Callable]): The scoring method. save_activations (bool): save model activations in Doc when annotating. 
DOCS: https://spacy.io/api/textcategorizer#init @@ -172,6 +187,7 @@ def __init__( self.cfg = dict(cfg) self.scorer = scorer self.save_activations = save_activations + self.save_activations = save_activations @property def support_missing_values(self): diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx index 065a6c20d62..b9c297990f9 100644 --- a/spacy/pipeline/trainable_pipe.pyx +++ b/spacy/pipeline/trainable_pipe.pyx @@ -2,10 +2,14 @@ from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple import srsly -from thinc.api import Model, Optimizer, set_dropout_rate +from thinc.api import set_dropout_rate, Model, Optimizer +import warnings from ..tokens.doc cimport Doc +from ..training import validate_examples +from ..errors import Errors, Warnings +from .pipe import Pipe, deserialize_config from .. import util from ..errors import Errors from ..language import Language diff --git a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py index 7465c844492..e423965bedc 100644 --- a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py +++ b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py @@ -1,3 +1,4 @@ +from typing import cast import pickle from typing import cast @@ -10,6 +11,7 @@ from spacy.language import Language from spacy.pipeline._edit_tree_internals.edit_trees import EditTrees from spacy.pipeline.trainable_pipe import TrainablePipe +from spacy.training import Example from spacy.strings import StringStore from spacy.training import Example from spacy.util import make_tempdir @@ -403,3 +405,26 @@ def test_save_activations(): ] assert doc.activations["trainable_lemmatizer"]["probabilities"].shape == (5, nO) assert doc.activations["trainable_lemmatizer"]["tree_ids"].shape == (5,) + + +def test_save_activations(): + nlp = English() + lemmatizer = cast(TrainablePipe, nlp.add_pipe("trainable_lemmatizer")) + lemmatizer.min_tree_freq = 1 + train_examples = [] + for t in TRAIN_DATA: + 
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + nlp.initialize(get_examples=lambda: train_examples) + nO = lemmatizer.model.get_dim("nO") + + doc = nlp("This is a test.") + assert "trainable_lemmatizer" not in doc.activations + + lemmatizer.save_activations = True + doc = nlp("This is a test.") + assert list(doc.activations["trainable_lemmatizer"].keys()) == [ + "probabilities", + "tree_ids", + ] + assert doc.activations["trainable_lemmatizer"]["probabilities"].shape == (5, nO) + assert doc.activations["trainable_lemmatizer"]["tree_ids"].shape == (5,) diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index e44fef2ad25..804332a9ae8 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -1,8 +1,9 @@ -from typing import Any, Callable, Dict, Iterable, cast +from typing import Callable, Iterable, Dict, Any, cast import pytest from numpy.testing import assert_equal from thinc.types import Ragged +from thinc.types import Ragged from spacy import Language, registry, util from spacy.attrs import ENT_KB_ID @@ -10,8 +11,8 @@ from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase from spacy.lang.en import English from spacy.ml import load_kb -from spacy.ml.models.entity_linker import build_span_maker, get_candidates from spacy.pipeline import EntityLinker, TrainablePipe +from spacy.pipeline.legacy import EntityLinker_v1 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL from spacy.scorer import Scorer from spacy.tests.util import make_tempdir @@ -1292,6 +1293,7 @@ def create_kb(vocab): assert doc.ents[0].kb_id_ == entity_id if meet_threshold else EntityLinker.NIL +def test_save_activations(): def test_save_activations(): nlp = English() vector_length = 3 @@ -1307,7 +1309,7 @@ def create_kb(vocab): # create artificial KB - assign same prior weight to the two russ cochran's # Q2146908 (Russ Cochran): American golfer # Q7381115 (Russ 
Cochran): publisher - mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length) + mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7]) mykb.add_alias( diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py index 542d14d1516..bf2eea8a94e 100644 --- a/spacy/tests/pipeline/test_morphologizer.py +++ b/spacy/tests/pipeline/test_morphologizer.py @@ -1,5 +1,4 @@ from typing import cast - import pytest from numpy.testing import assert_almost_equal, assert_equal from thinc.api import get_current_ops @@ -10,7 +9,7 @@ from spacy.language import Language from spacy.morphology import Morphology from spacy.pipeline import TrainablePipe -from spacy.tests.util import make_tempdir +from spacy.attrs import MORPH from spacy.tokens import Doc from spacy.training import Example @@ -255,3 +254,25 @@ def test_save_activations(): } assert doc.activations["morphologizer"]["probabilities"].shape == (5, 6) assert doc.activations["morphologizer"]["label_ids"].shape == (5,) + + +def test_save_activations(): + nlp = English() + morphologizer = cast(TrainablePipe, nlp.add_pipe("morphologizer")) + train_examples = [] + for inst in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1])) + nlp.initialize(get_examples=lambda: train_examples) + + doc = nlp("This is a test.") + assert "morphologizer" not in doc.activations + + morphologizer.save_activations = True + doc = nlp("This is a test.") + assert "morphologizer" in doc.activations + assert set(doc.activations["morphologizer"].keys()) == { + "label_ids", + "probabilities", + } + assert doc.activations["morphologizer"]["probabilities"].shape == (5, 6) + assert doc.activations["morphologizer"]["label_ids"].shape == (5,) diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py index 
51f943898f1..a594c10b04c 100644 --- a/spacy/tests/pipeline/test_senter.py +++ b/spacy/tests/pipeline/test_senter.py @@ -1,5 +1,4 @@ from typing import cast - import pytest from numpy.testing import assert_equal @@ -8,6 +7,7 @@ from spacy.lang.en import English from spacy.language import Language from spacy.pipeline import TrainablePipe +from spacy.pipeline import TrainablePipe from spacy.tests.util import make_tempdir from spacy.training import Example @@ -133,3 +133,26 @@ def test_save_activations(): assert set(doc.activations["senter"].keys()) == {"label_ids", "probabilities"} assert doc.activations["senter"]["probabilities"].shape == (5, nO) assert doc.activations["senter"]["label_ids"].shape == (5,) + + +def test_save_activations(): + # Test if activations are correctly added to Doc when requested. + nlp = English() + senter = cast(TrainablePipe, nlp.add_pipe("senter")) + + train_examples = [] + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + + nlp.initialize(get_examples=lambda: train_examples) + nO = senter.model.get_dim("nO") + + doc = nlp("This is a test.") + assert "senter" not in doc.activations + + senter.save_activations = True + doc = nlp("This is a test.") + assert "senter" in doc.activations + assert set(doc.activations["senter"].keys()) == {"label_ids", "probabilities"} + assert doc.activations["senter"]["probabilities"].shape == (5, nO) + assert doc.activations["senter"]["label_ids"].shape == (5,) diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index 05e814f0733..50cc828a038 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -1,5 +1,4 @@ from typing import cast - import pytest from numpy.testing import assert_almost_equal, assert_equal from thinc.api import compounding, get_current_ops @@ -9,7 +8,7 @@ from spacy.lang.en import English from spacy.language import Language from spacy.pipeline import TrainablePipe -from 
spacy.training import Example +from thinc.api import compounding from ..util import make_tempdir @@ -240,52 +239,6 @@ def test_overfitting_IO(): assert doc3[0].tag_ != "N" -def test_is_distillable(): - nlp = English() - tagger = nlp.add_pipe("tagger") - assert tagger.is_distillable - - -def test_distill(): - teacher = English() - teacher_tagger = teacher.add_pipe("tagger") - train_examples = [] - for t in TRAIN_DATA: - train_examples.append(Example.from_dict(teacher.make_doc(t[0]), t[1])) - - optimizer = teacher.initialize(get_examples=lambda: train_examples) - - for i in range(50): - losses = {} - teacher.update(train_examples, sgd=optimizer, losses=losses) - assert losses["tagger"] < 0.00001 - - student = English() - student_tagger = student.add_pipe("tagger") - student_tagger.min_tree_freq = 1 - student_tagger.initialize( - get_examples=lambda: train_examples, labels=teacher_tagger.label_data - ) - - distill_examples = [ - Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA - ] - - for i in range(50): - losses = {} - student_tagger.distill( - teacher_tagger, distill_examples, sgd=optimizer, losses=losses - ) - assert losses["tagger"] < 0.00001 - - test_text = "I like blue eggs" - doc = student(test_text) - assert doc[0].tag_ == "N" - assert doc[1].tag_ == "V" - assert doc[2].tag_ == "J" - assert doc[3].tag_ == "N" - - def test_save_activations(): # Test if activations are correctly added to Doc when requested. 
nlp = English() diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 2bba40d1d13..a54bf394608 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -1,3 +1,4 @@ +from typing import cast import random from typing import cast @@ -13,16 +14,12 @@ from spacy.lang.en import English from spacy.language import Language from spacy.pipeline import TextCategorizer, TrainablePipe -from spacy.pipeline.textcat import ( - single_label_bow_config, - single_label_cnn_config, - single_label_default_config, -) -from spacy.pipeline.textcat_multilabel import ( - multi_label_bow_config, - multi_label_cnn_config, - multi_label_default_config, -) +from spacy.pipeline.textcat import single_label_bow_config +from spacy.pipeline.textcat import single_label_cnn_config +from spacy.pipeline.textcat import single_label_default_config +from spacy.pipeline.textcat_multilabel import multi_label_bow_config +from spacy.pipeline.textcat_multilabel import multi_label_cnn_config +from spacy.pipeline.textcat_multilabel import multi_label_default_config from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL from spacy.scorer import Scorer from spacy.tokens import Doc, DocBin @@ -304,6 +301,7 @@ def test_issue9904(): examples = get_examples() scores = textcat.predict([eg.predicted for eg in examples])["probabilities"] + scores = textcat.predict([eg.predicted for eg in examples])["probabilities"] loss = textcat.get_loss(examples, scores)[0] loss_double_bs = textcat.get_loss(examples * 2, scores.repeat(2, axis=0))[0] @@ -962,9 +960,11 @@ def test_textcat_multi_threshold(): assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0 +def test_save_activations(): def test_save_activations(): nlp = English() textcat = cast(TrainablePipe, nlp.add_pipe("textcat")) + textcat = cast(TrainablePipe, nlp.add_pipe("textcat")) train_examples = [] for text, annotations in TRAIN_DATA_SINGLE_LABEL: @@ -981,6 +981,34 @@ def 
test_save_activations(): assert doc.activations["textcat"]["probabilities"].shape == (nO,) +def test_save_activations_multi(): + nlp = English() + textcat = cast(TrainablePipe, nlp.add_pipe("textcat_multilabel")) + + train_examples = [] + for text, annotations in TRAIN_DATA_MULTI_LABEL: + train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) + nlp.initialize(get_examples=lambda: train_examples) + nO = textcat.model.get_dim("nO") + + doc = nlp("This is a test.") + assert "textcat_multilabel" not in doc.activations + + textcat.save_activations = True + doc = nlp("This is a test.") + assert list(doc.activations["textcat_multilabel"].keys()) == ["probabilities"] + assert doc.activations["textcat_multilabel"]["probabilities"].shape == (nO,) + nO = textcat.model.get_dim("nO") + + doc = nlp("This is a test.") + assert "textcat" not in doc.activations + + textcat.save_activations = True + doc = nlp("This is a test.") + assert list(doc.activations["textcat"].keys()) == ["probabilities"] + assert doc.activations["textcat"]["probabilities"].shape == (nO,) + + def test_save_activations_multi(): nlp = English() textcat = cast(TrainablePipe, nlp.add_pipe("textcat_multilabel")) diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index 9fb6a72c87f..fc0404f1423 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -50,6 +50,8 @@ cdef class Doc: cdef public dict activations + cdef public dict activations + cdef public dict user_hooks cdef public dict user_token_hooks cdef public dict user_span_hooks diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi index 1304a8aae8d..d83aa0e5486 100644 --- a/spacy/tokens/doc.pyi +++ b/spacy/tokens/doc.pyi @@ -15,7 +15,7 @@ from typing import ( import numpy as np from cymem.cymem import Pool -from thinc.types import Floats1d, Floats2d, Ints2d +from thinc.types import ArrayXd, Floats1d, Floats2d, Ints2d, Ragged from .span import Span from .token import Token from .span_groups import SpanGroups @@ -37,6 +37,7 
@@ class Doc: spans: SpanGroups max_length: int length: int + sentiment: float activations: Dict[str, Dict[str, Union[ArrayXd, Ragged]]] cats: Dict[str, float] user_hooks: Dict[str, Callable[..., Any]] diff --git a/website/docs/api/doc.mdx b/website/docs/api/doc.mdx index e92c0e833e0..842b2181a81 100644 --- a/website/docs/api/doc.mdx +++ b/website/docs/api/doc.mdx @@ -762,6 +762,7 @@ The L2 norm of the document's vector representation. | `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ | | `lang` 2.1 | Language of the document's vocabulary. ~~int~~ | | `lang_` 2.1 | Language of the document's vocabulary. ~~str~~ | +| `sentiment` | The document's positivity/negativity score, if available. ~~float~~ | | `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ | | `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ | | `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ | diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx index a5dd721e37a..074d1706ac7 100644 --- a/website/docs/api/entitylinker.mdx +++ b/website/docs/api/entitylinker.mdx @@ -63,7 +63,7 @@ architectures and their arguments and hyperparameters. | `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ | | `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ | | `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. 
~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | -| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ | +| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | | `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | | `save_activations` 4.0 | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~bool~~ | | `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | diff --git a/website/docs/api/morphologizer.mdx b/website/docs/api/morphologizer.mdx index 61abe043e77..4660ec312fa 100644 --- a/website/docs/api/morphologizer.mdx +++ b/website/docs/api/morphologizer.mdx @@ -45,7 +45,7 @@ architectures and their arguments and hyperparameters. | Setting | Description | | ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `model` | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | -| `overwrite` 3.2 | Whether the values of existing features are overwritten. Defaults to `False`. ~~bool~~ | +| `overwrite` 3.2 | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~ | | `extend` 3.2 | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~ | | `scorer` 3.2 | The scoring method.
Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ | | `save_activations` 4.0 | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~bool~~ | @@ -454,8 +454,8 @@ coarse-grained POS as the feature `POS`. > assert "Mood=Ind|POS=VERB|Tense=Past|VerbForm=Fin" in morphologizer.labels > ``` -| Name | Description | -| ----------- | ------------------------------------------------------ | +| Name | Description | +| ----------- | --------------------------------------------------------- | | **RETURNS** | The labels added to the component. ~~Iterable[str, ...]~~ | ## Morphologizer.label_data {id="label_data",tag="property",version="3"}