From bc106789618fc5e8eeeffcb5d445d380f3ee08df Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?=
Date: Tue, 13 Sep 2022 09:51:12 +0200
Subject: [PATCH] Store activations in `Doc`s when `save_activations` is
 enabled (#11002)

* Store activations in Doc when `store_activations` is enabled

This change adds the new `activations` attribute to `Doc`. This
attribute can be used by trainable pipes to store their activations,
probabilities, and guesses for downstream users.

As an example, this change modifies the `tagger` and `senter` pipes to
add a `store_activations` option. When this option is enabled, the
probabilities and guesses are stored in `set_annotations`.

* Change type of `store_activations` to `Union[bool, List[str]]`

When the value is:

- A bool: all activations are stored when set to `True`.
- A List[str]: the activations named in the list are stored.

* Formatting fixes in Tagger

* Support store_activations in spancat and morphologizer

* Make Doc.activations type visible to MyPy

* textcat/textcat_multilabel: add store_activations option

* trainable_lemmatizer/entity_linker: add store_activations option

* parser/ner: do not currently support returning activations

* Extend tagger and senter tests

So that they, like the other tests, also check that we get no
activations if no activations were requested.

* Document `Doc.activations` and `store_activations` in the relevant pipes

* Start errors/warnings at higher numbers to avoid merge conflicts

Between the master and v4 branches.

* Add `store_activations` to docstrings.

* Replace store_activations setter by set_store_activations method

Setters that take a different type than what the getter returns are
still problematic for MyPy. Replace the setter by a method, so that
type inference works everywhere.

* Use dict comprehension suggested by @svlandeg

* Revert "Use dict comprehension suggested by @svlandeg"

This reverts commit 6e7b958f7060397965176c69649e5414f1f24988.

* EntityLinker: add type annotations to _add_activations

* _store_activations: make kwarg-only, remove doc_scores_lens arg

* set_annotations: add type annotations

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem

* TextCat.predict: return dict

* Make the `TrainablePipe.store_activations` property a bool

This means that we can also bring back the `store_activations` setter.

* Remove `TrainablePipe.activations`

We do not need to enumerate the activations anymore since
`store_activations` is `bool`.

* Add type annotations for activations in predict/set_annotations

* Rename `TrainablePipe.store_activations` to `save_activations`

* Error E1400 is not used anymore

This error was used when activations were still
`Union[bool, List[str]]`.

* Change wording in API docs after store -> save change

* docs: tag (save_)activations as new in spaCy 4.0

* Fix copied line in morphologizer activations test

* Don't train in any test_save_activations test

* Rename activations

- "probs" -> "probabilities"
- "guesses" -> "label_ids", except in the edit tree lemmatizer, where
  "guesses" -> "tree_ids".

* Remove unused W400 warning.

This warning was used when we still allowed the user to specify
which activations to save.

* Formatting fixes

Co-authored-by: Sofie Van Landeghem

* Replace "kb_ids" by a constant

* spancat: replace a cast by an assertion

* Fix EOF spacing

* Fix comments in test_save_activations tests

* Do not set RNG seed in activation saving tests

* Revert "spancat: replace a cast by an assertion"

This reverts commit 0bd5730d16432443a2b247316928d4f789ad8741.
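To make the new behaviour concrete for reviewers, the end-user flow exercised
by the added `test_save_activations` tests looks roughly like the sketch below.
This is illustrative only: it assumes a spaCy build that includes this patch,
and `en_core_web_sm` is merely a stand-in for any pipeline with a trained
tagger component.

    import spacy

    nlp = spacy.load("en_core_web_sm")  # placeholder: any pipeline with a tagger
    tagger = nlp.get_pipe("tagger")

    doc = nlp("This is a test.")
    assert "tagger" not in doc.activations  # saving is off by default

    tagger.save_activations = True
    doc = nlp("This is a test.")
    # Each pipe that saves activations stores them under its component name.
    probs = doc.activations["tagger"]["probabilities"]  # shape (n_tokens, n_labels)
    label_ids = doc.activations["tagger"]["label_ids"]  # shape (n_tokens,)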
Co-authored-by: Sofie Van Landeghem
---
 spacy/pipeline/edit_tree_lemmatizer.py        |  46 ++--
 spacy/pipeline/entity_linker.py               | 244 +++++++++++-------
 spacy/pipeline/morphologizer.pyx              |  37 ++-
 spacy/pipeline/senter.pyx                     |  38 ++-
 spacy/pipeline/spancat.py                     |  84 +++---
 spacy/pipeline/tagger.pyx                     |  43 ++-
 spacy/pipeline/textcat.py                     |  37 ++-
 spacy/pipeline/textcat_multilabel.py          |  23 +-
 spacy/pipeline/trainable_pipe.pxd             |   1 +
 spacy/pipeline/trainable_pipe.pyx             |  14 +-
 .../pipeline/test_edit_tree_lemmatizer.py     |  26 ++
 spacy/tests/pipeline/test_entity_linker.py    |  78 ++++--
 spacy/tests/pipeline/test_morphologizer.py    |  26 +-
 spacy/tests/pipeline/test_senter.py           |  25 ++
 spacy/tests/pipeline/test_spancat.py          |  34 +--
 spacy/tests/pipeline/test_tagger.py           |  24 +-
 spacy/tests/pipeline/test_textcat.py          |  64 +++--
 spacy/tokens/doc.pxd                          |   2 +
 spacy/tokens/doc.pyi                          |   3 +-
 spacy/tokens/doc.pyx                          |   1 +
 website/docs/api/doc.mdx                      |  33 +--
 website/docs/api/edittreelemmatizer.mdx       |  17 +-
 website/docs/api/entitylinker.mdx             |  29 +--
 website/docs/api/morphologizer.mdx            |  18 +-
 website/docs/api/sentencerecognizer.mdx       |  11 +-
 website/docs/api/spancategorizer.mdx          |  35 +--
 website/docs/api/tagger.mdx                   |  14 +-
 website/docs/api/textcategorizer.mdx          |  17 +-
 28 files changed, 669 insertions(+), 355 deletions(-)

diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index 4a6174bc3d8..2ef639cad52 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -4,8 +4,8 @@
 import numpy as np
 import srsly

-from thinc.api import Config, Model, NumpyOps, SequenceCategoricalCrossentropy
-from thinc.types import Floats2d, Ints2d
+from thinc.api import Config, Model, SequenceCategoricalCrossentropy
+from thinc.types import ArrayXd, Floats2d, Ints1d

 from .. import util
 from ..errors import Errors
@@ -22,6 +22,9 @@

 TOP_K_GUARDRAIL = 20

+ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
+
+
 default_model_config = """
 [model]
 @architectures = "spacy.Tagger.v2"
@@ -50,6 +53,7 @@
         "overwrite": False,
         "top_k": 1,
         "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+        "save_activations": False,
     },
     default_score_weights={"lemma_acc": 1.0},
 )
@@ -62,6 +66,7 @@ def make_edit_tree_lemmatizer(
     overwrite: bool,
     top_k: int,
     scorer: Optional[Callable],
+    save_activations: bool,
 ):
     """Construct an EditTreeLemmatizer component."""
     return EditTreeLemmatizer(
@@ -73,6 +78,7 @@ def make_edit_tree_lemmatizer(
         overwrite=overwrite,
         top_k=top_k,
         scorer=scorer,
+        save_activations=save_activations,
     )


@@ -92,6 +98,7 @@ def __init__(
         overwrite: bool = False,
         top_k: int = 1,
         scorer: Optional[Callable] = lemmatizer_score,
+        save_activations: bool = False,
     ):
         """
         Construct an edit tree lemmatizer.
@@ -103,6 +110,7 @@ def __init__(
             frequency in the training data.
         overwrite (bool): overwrite existing lemma annotations.
         top_k (int): try to apply at most the k most probable edit trees.
+        save_activations (bool): save model activations in Doc when annotating.
""" self.vocab = vocab self.model = model @@ -117,7 +125,7 @@ def __init__( self.cfg: Dict[str, Any] = {"labels": []} self.scorer = scorer - self.numpy_ops = NumpyOps() + self.save_activations = save_activations def get_loss( self, examples: Iterable[Example], scores: List[Floats2d] @@ -146,31 +154,24 @@ def get_loss( return float(loss), d_scores - def predict(self, docs: Iterable[Doc]) -> List[Ints2d]: - if self.top_k == 1: - scores2guesses = self._scores2guesses_top_k_equals_1 - elif self.top_k <= TOP_K_GUARDRAIL: - scores2guesses = self._scores2guesses_top_k_greater_1 - else: - scores2guesses = self._scores2guesses_top_k_guardrail - # The behaviour of *_scores2guesses_top_k_greater_1()* is efficient for values - # of *top_k>1* that are likely to be useful when the edit tree lemmatizer is used - # for its principal purpose of lemmatizing tokens. However, the code could also - # be used for other purposes, and with very large values of *top_k* the method - # becomes inefficient. In such cases, *_scores2guesses_top_k_guardrail()* is used - # instead. + def predict(self, docs: Iterable[Doc]) -> ActivationsT: n_docs = len(list(docs)) if not any(len(doc) for doc in docs): # Handle cases where there are no tokens in any docs. n_labels = len(self.cfg["labels"]) - guesses: List[Ints2d] = [self.model.ops.alloc2i(0, n_labels) for _ in docs] + guesses: List[Ints1d] = [ + self.model.ops.alloc((0,), dtype="i") for doc in docs + ] + scores: List[Floats2d] = [ + self.model.ops.alloc((0, n_labels), dtype="i") for doc in docs + ] assert len(guesses) == n_docs - return guesses + return {"probabilities": scores, "tree_ids": guesses} scores = self.model.predict(docs) assert len(scores) == n_docs guesses = scores2guesses(docs, scores) assert len(guesses) == n_docs - return guesses + return {"probabilities": scores, "tree_ids": guesses} def _scores2guesses_top_k_equals_1(self, docs, scores): guesses = [] @@ -230,8 +231,13 @@ def _scores2guesses_top_k_guardrail(self, docs, scores): return guesses - def set_annotations(self, docs: Iterable[Doc], batch_tree_ids): + def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT): + batch_tree_ids = activations["tree_ids"] for i, doc in enumerate(docs): + if self.save_activations: + doc.activations[self.name] = {} + for act_name, acts in activations.items(): + doc.activations[self.name][act_name] = acts[i] doc_tree_ids = batch_tree_ids[i] if hasattr(doc_tree_ids, "get"): doc_tree_ids = doc_tree_ids.get() diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index a730ece1bfa..bab79282d5b 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -1,3 +1,10 @@ +from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any +from typing import cast +from numpy import dtype +from thinc.types import Floats1d, Floats2d, Ints1d, Ragged +from pathlib import Path +from itertools import islice +import srsly import random from itertools import islice from pathlib import Path @@ -21,6 +28,11 @@ from .pipe import deserialize_config from .trainable_pipe import TrainablePipe + +ActivationsT = Dict[str, Union[List[Ragged], List[str]]] + +KNOWLEDGE_BASE_IDS = "kb_ids" + # See #9050 BACKWARD_OVERWRITE = True @@ -60,6 +72,7 @@ "use_gold_ents": True, "candidates_batch_size": 1, "threshold": None, + "save_activations": False, }, default_score_weights={ "nel_micro_f": 1.0, @@ -87,6 +100,7 @@ def make_entity_linker( use_gold_ents: bool, candidates_batch_size: int, threshold: Optional[float] = None, + 
save_activations: bool, ): """Construct an EntityLinker component. @@ -110,6 +124,7 @@ def make_entity_linker( candidates_batch_size (int): Size of batches for entity candidate generation. threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold, prediction is discarded. If None, predictions are not filtered by any threshold. + save_activations (bool): save model activations in Doc when annotating. """ if not model.attrs.get("include_span_maker", False): @@ -144,6 +159,7 @@ def make_entity_linker( use_gold_ents=use_gold_ents, candidates_batch_size=candidates_batch_size, threshold=threshold, + save_activations=save_activations, ) @@ -185,6 +201,7 @@ def __init__( use_gold_ents: bool, candidates_batch_size: int, threshold: Optional[float] = None, + save_activations: bool = False, ) -> None: """Initialize an entity linker. @@ -239,6 +256,7 @@ def __init__( self.use_gold_ents = use_gold_ents self.candidates_batch_size = candidates_batch_size self.threshold = threshold + self.save_activations = save_activations if candidates_batch_size < 1: raise ValueError(Errors.E1044) @@ -427,7 +445,7 @@ def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d): loss = loss / len(entity_encodings) return float(loss), out - def predict(self, docs: Iterable[Doc]) -> List[str]: + def predict(self, docs: Iterable[Doc]) -> ActivationsT: """Apply the pipeline's model to a batch of docs, without modifying them. Returns the KB IDs for each entity in each doc, including NIL if there is no prediction. @@ -440,129 +458,138 @@ def predict(self, docs: Iterable[Doc]) -> List[str]: self.validate_kb() entity_count = 0 final_kb_ids: List[str] = [] - xp = self.model.ops.xp + ops = self.model.ops + xp = ops.xp + docs_ents: List[Ragged] = [] + docs_scores: List[Ragged] = [] if not docs: - return final_kb_ids + return {KNOWLEDGE_BASE_IDS: final_kb_ids, "ents": docs_ents, "scores": docs_scores} if isinstance(docs, Doc): docs = [docs] - for i, doc in enumerate(docs): + for doc in docs: + doc_ents: List[Ints1d] = [] + doc_scores: List[Floats1d] = [] if len(doc) == 0: + docs_scores.append(Ragged(ops.alloc1f(0), ops.alloc1i(0))) + docs_ents.append(Ragged(xp.zeros(0, dtype="uint64"), ops.alloc1i(0))) continue sentences = [s for s in doc.sents] - # Loop over entities in batches. - for ent_idx in range(0, len(doc.ents), self.candidates_batch_size): - ent_batch = doc.ents[ent_idx : ent_idx + self.candidates_batch_size] - - # Look up candidate entities. 
- valid_ent_idx = [ - idx - for idx in range(len(ent_batch)) - if ent_batch[idx].label_ not in self.labels_discard - ] - - batch_candidates = list( - self.get_candidates_batch( - self.kb, [ent_batch[idx] for idx in valid_ent_idx] - ) - if self.candidates_batch_size > 1 - else [ - self.get_candidates(self.kb, ent_batch[idx]) - for idx in valid_ent_idx - ] - ) - - # Looping through each entity in batch (TODO: rewrite) - for j, ent in enumerate(ent_batch): - assert hasattr(ent, "sents") - sents = list(ent.sents) - sent_indices = ( - sentences.index(sents[0]), - sentences.index(sents[-1]), + if self.incl_context: + # get n_neighbour sentences, clipped to the length of the document + start_sentence = max(0, sent_index - self.n_sents) + end_sentence = min(len(sentences) - 1, sent_index + self.n_sents) + start_token = sentences[start_sentence].start + end_token = sentences[end_sentence].end + sent_doc = doc[start_token:end_token].as_doc() + # currently, the context is the same for each entity in a sentence (should be refined) + sentence_encoding = self.model.predict([sent_doc])[0] + sentence_encoding_t = sentence_encoding.T + sentence_norm = xp.linalg.norm(sentence_encoding_t) + entity_count += 1 + if ent.label_ in self.labels_discard: + # ignoring this entity - setting to NIL + final_kb_ids.append(self.NIL) + self._add_activations( + doc_scores=doc_scores, + doc_ents=doc_ents, + scores=[0.0], + ents=[0], ) - assert sent_indices[1] >= sent_indices[0] >= 0 - - if self.incl_context: - # get n_neighbour sentences, clipped to the length of the document - start_sentence = max(0, sent_indices[0] - self.n_sents) - end_sentence = min( - len(sentences) - 1, sent_indices[1] + self.n_sents - ) - start_token = sentences[start_sentence].start - end_token = sentences[end_sentence].end - sent_doc = doc[start_token:end_token].as_doc() - - # currently, the context is the same for each entity in a sentence (should be refined) - sentence_encoding = self.model.predict([sent_doc])[0] - sentence_encoding_t = sentence_encoding.T - sentence_norm = xp.linalg.norm(sentence_encoding_t) - entity_count += 1 - if ent.label_ in self.labels_discard: - # ignoring this entity - setting to NIL + else: + candidates = list(self.get_candidates(self.kb, ent)) + if not candidates: + # no prediction possible for this entity - setting to NIL final_kb_ids.append(self.NIL) + self._add_activations( + doc_scores=doc_scores, + doc_ents=doc_ents, + scores=[0.0], + ents=[0], + ) + elif len(candidates) == 1 and self.threshold is None: + # shortcut for efficiency reasons: take the 1 candidate + final_kb_ids.append(candidates[0].entity_) + self._add_activations( + doc_scores=doc_scores, + doc_ents=doc_ents, + scores=[1.0], + ents=[candidates[0].entity_], + ) else: - candidates = list(batch_candidates[j]) - if not candidates: - # no prediction possible for this entity - setting to NIL - final_kb_ids.append(self.NIL) - elif len(candidates) == 1 and self.threshold is None: - # shortcut for efficiency reasons: take the 1 candidate - final_kb_ids.append(candidates[0].entity_) - else: - random.shuffle(candidates) - # set all prior probabilities to 0 if incl_prior=False - prior_probs = xp.asarray([c.prior_prob for c in candidates]) - if not self.incl_prior: - prior_probs = xp.asarray([0.0 for _ in candidates]) - scores = prior_probs - # add in similarity from the context - if self.incl_context: - entity_encodings = xp.asarray( - [c.entity_vector for c in candidates] - ) - entity_norm = xp.linalg.norm(entity_encodings, axis=1) - if len(entity_encodings) 
!= len(prior_probs): - raise RuntimeError( - Errors.E147.format( - method="predict", - msg="vectors not of equal length", - ) + random.shuffle(candidates) + # set all prior probabilities to 0 if incl_prior=False + prior_probs = xp.asarray([c.prior_prob for c in candidates]) + if not self.incl_prior: + prior_probs = xp.asarray([0.0 for _ in candidates]) + scores = prior_probs + # add in similarity from the context + if self.incl_context: + entity_encodings = xp.asarray( + [c.entity_vector for c in candidates] + ) + entity_norm = xp.linalg.norm(entity_encodings, axis=1) + if len(entity_encodings) != len(prior_probs): + raise RuntimeError( + Errors.E147.format( + method="predict", + msg="vectors not of equal length", ) - # cosine similarity - sims = xp.dot(entity_encodings, sentence_encoding_t) / ( - sentence_norm * entity_norm ) - if sims.shape != prior_probs.shape: - raise ValueError(Errors.E161) - scores = prior_probs + sims - (prior_probs * sims) - final_kb_ids.append( - candidates[scores.argmax().item()].entity_ - if self.threshold is None - or scores.max() >= self.threshold - else EntityLinker.NIL + # cosine similarity + sims = xp.dot(entity_encodings, sentence_encoding_t) / ( + sentence_norm * entity_norm ) - + if sims.shape != prior_probs.shape: + raise ValueError(Errors.E161) + scores = prior_probs + sims - (prior_probs * sims) + final_kb_ids.append( + candidates[scores.argmax().item()].entity_ + if self.threshold is None or scores.max() >= self.threshold + else EntityLinker.NIL + ) + self._add_activations( + doc_scores=doc_scores, + doc_ents=doc_ents, + scores=scores, + ents=[c.entity for c in candidates], + ) + self._add_doc_activations( + docs_scores=docs_scores, + docs_ents=docs_ents, + doc_scores=doc_scores, + doc_ents=doc_ents, + ) if not (len(final_kb_ids) == entity_count): err = Errors.E147.format( method="predict", msg="result variables not of equal length" ) raise RuntimeError(err) - return final_kb_ids + return {KNOWLEDGE_BASE_IDS: final_kb_ids, "ents": docs_ents, "scores": docs_scores} - def set_annotations(self, docs: Iterable[Doc], kb_ids: List[str]) -> None: + def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None: """Modify a batch of documents, using pre-computed scores. docs (Iterable[Doc]): The documents to modify. - kb_ids (List[str]): The IDs to set, produced by EntityLinker.predict. + activations (ActivationsT): The activations used for setting annotations, produced + by EntityLinker.predict. DOCS: https://spacy.io/api/entitylinker#set_annotations """ + kb_ids = cast(List[str], activations[KNOWLEDGE_BASE_IDS]) count_ents = len([ent for doc in docs for ent in doc.ents]) if count_ents != len(kb_ids): raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids))) i = 0 overwrite = self.cfg["overwrite"] - for doc in docs: + for j, doc in enumerate(docs): + if self.save_activations: + doc.activations[self.name] = {} + for act_name, acts in activations.items(): + if act_name != KNOWLEDGE_BASE_IDS: + # We only copy activations that are Ragged. 
+ doc.activations[self.name][act_name] = cast(Ragged, acts[j]) + for ent in doc.ents: kb_id = kb_ids[i] i += 1 @@ -661,3 +688,32 @@ def rehearse(self, examples, *, sgd=None, losses=None, **config): def add_label(self, label): raise NotImplementedError + + def _add_doc_activations( + self, + *, + docs_scores: List[Ragged], + docs_ents: List[Ragged], + doc_scores: List[Floats1d], + doc_ents: List[Ints1d], + ): + if not self.save_activations: + return + ops = self.model.ops + lengths = ops.asarray1i([s.shape[0] for s in doc_scores]) + docs_scores.append(Ragged(ops.flatten(doc_scores), lengths)) + docs_ents.append(Ragged(ops.flatten(doc_ents), lengths)) + + def _add_activations( + self, + *, + doc_scores: List[Floats1d], + doc_ents: List[Ints1d], + scores: Sequence[float], + ents: Sequence[int], + ): + if not self.save_activations: + return + ops = self.model.ops + doc_scores.append(ops.asarray1f(scores)) + doc_ents.append(ops.asarray1i(ents, dtype="uint64")) diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index bdbe75fd824..cc8f87936b9 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -1,4 +1,8 @@ -# cython: infer_types=True, binding=True +# cython: infer_types=True, profile=True, binding=True +from typing import Callable, Dict, Iterable, List, Optional, Union +import srsly +from thinc.api import SequenceCategoricalCrossentropy, Model, Config +from thinc.types import Floats2d, Ints1d from itertools import islice from typing import Callable, Dict, Optional, Union @@ -8,6 +12,12 @@ from ..morphology cimport Morphology from ..tokens.doc cimport Doc from ..vocab cimport Vocab +from ..parts_of_speech import IDS as POS_IDS +from ..symbols import POS +from ..language import Language +from ..errors import Errors +from .pipe import deserialize_config +from .tagger import ActivationsT, Tagger from .. import util from ..errors import Errors from ..language import Language @@ -50,8 +60,13 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"] @Language.factory( "morphologizer", assigns=["token.morph", "token.pos"], - default_config={"model": DEFAULT_MORPH_MODEL, "overwrite": True, "extend": False, - "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}, "label_smoothing": 0.0}, + default_config={ + "model": DEFAULT_MORPH_MODEL, + "overwrite": True, + "extend": False, + "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}, + "save_activations": False, + }, default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None}, ) def make_morphologizer( @@ -62,8 +77,10 @@ def make_morphologizer( extend: bool, label_smoothing: float, scorer: Optional[Callable], + save_activations: bool, ): - return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, label_smoothing=label_smoothing, scorer=scorer) + return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer, + save_activations=save_activations) def morphologizer_score(examples, **kwargs): @@ -99,6 +116,7 @@ class Morphologizer(Tagger): extend: bool = BACKWARD_EXTEND, label_smoothing: float = 0.0, scorer: Optional[Callable] = morphologizer_score, + save_activations: bool = False, ): """Initialize a morphologizer. @@ -109,6 +127,7 @@ class Morphologizer(Tagger): scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_token_attr for the attributes "pos" and "morph" and Scorer.score_token_attr_per_feat for the attribute "morph". 
+ save_activations (bool): save model activations in Doc when annotating. DOCS: https://spacy.io/api/morphologizer#init """ @@ -129,6 +148,7 @@ class Morphologizer(Tagger): } self.cfg = dict(sorted(cfg.items())) self.scorer = scorer + self.save_activations = save_activations @property def labels(self): @@ -222,14 +242,15 @@ class Morphologizer(Tagger): assert len(label_sample) > 0, Errors.E923.format(name=self.name) self.model.initialize(X=doc_sample, Y=label_sample) - def set_annotations(self, docs, batch_tag_ids): + def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT): """Modify a batch of documents, using pre-computed scores. docs (Iterable[Doc]): The documents to modify. - batch_tag_ids: The IDs to set, produced by Morphologizer.predict. + activations (ActivationsT): The activations used for setting annotations, produced by Morphologizer.predict. DOCS: https://spacy.io/api/morphologizer#set_annotations """ + batch_tag_ids = activations["label_ids"] if isinstance(docs, Doc): docs = [docs] cdef Doc doc @@ -240,6 +261,10 @@ class Morphologizer(Tagger): # to allocate a compatible container out of the iterable. labels = tuple(self.labels) for i, doc in enumerate(docs): + if self.save_activations: + doc.activations[self.name] = {} + for act_name, acts in activations.items(): + doc.activations[self.name][act_name] = acts[i] doc_tag_ids = batch_tag_ids[i] if hasattr(doc_tag_ids, "get"): doc_tag_ids = doc_tag_ids.get() diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index df093baa9c6..521afe1d181 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -1,12 +1,16 @@ -# cython: infer_types=True, binding=True +# cython: infer_types=True, profile=True, binding=True +from typing import Dict, Iterable, Optional, Callable, List, Union from itertools import islice from typing import Callable, Optional -from thinc.api import Config, Model, SequenceCategoricalCrossentropy +import srsly +from thinc.api import Model, SequenceCategoricalCrossentropy, Config +from thinc.types import Floats2d, Ints1d from ..tokens.doc cimport Doc -from .. import util +from .tagger import ActivationsT, Tagger +from ..language import Language from ..errors import Errors from ..language import Language from ..scorer import Scorer @@ -37,11 +41,21 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"] @Language.factory( "senter", assigns=["token.is_sent_start"], - default_config={"model": DEFAULT_SENTER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}}, + default_config={ + "model": DEFAULT_SENTER_MODEL, + "overwrite": False, + "scorer": {"@scorers": "spacy.senter_scorer.v1"}, + "save_activations": False, + }, default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, ) -def make_senter(nlp: Language, name: str, model: Model, overwrite: bool, scorer: Optional[Callable]): - return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer) +def make_senter(nlp: Language, + name: str, + model: Model, + overwrite: bool, + scorer: Optional[Callable], + save_activations: bool): + return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, save_activations=save_activations) def senter_score(examples, **kwargs): @@ -71,6 +85,7 @@ class SentenceRecognizer(Tagger): *, overwrite=BACKWARD_OVERWRITE, scorer=senter_score, + save_activations: bool = False, ): """Initialize a sentence recognizer. @@ -80,6 +95,7 @@ class SentenceRecognizer(Tagger): losses during training. 
scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_spans for the attribute "sents". + save_activations (bool): save model activations in Doc when annotating. DOCS: https://spacy.io/api/sentencerecognizer#init """ @@ -89,6 +105,7 @@ class SentenceRecognizer(Tagger): self._rehearsal_model = None self.cfg = {"overwrite": overwrite} self.scorer = scorer + self.save_activations = save_activations @property def labels(self): @@ -106,19 +123,24 @@ class SentenceRecognizer(Tagger): def label_data(self): return None - def set_annotations(self, docs, batch_tag_ids): + def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT): """Modify a batch of documents, using pre-computed scores. docs (Iterable[Doc]): The documents to modify. - batch_tag_ids: The IDs to set, produced by SentenceRecognizer.predict. + activations (ActivationsT): The activations used for setting annotations, produced by SentenceRecognizer.predict. DOCS: https://spacy.io/api/sentencerecognizer#set_annotations """ + batch_tag_ids = activations["label_ids"] if isinstance(docs, Doc): docs = [docs] cdef Doc doc cdef bint overwrite = self.cfg["overwrite"] for i, doc in enumerate(docs): + if self.save_activations: + doc.activations[self.name] = {} + for act_name, acts in activations.items(): + doc.activations[self.name][act_name] = acts[i] doc_tag_ids = batch_tag_ids[i] if hasattr(doc_tag_ids, "get"): doc_tag_ids = doc_tag_ids.get() diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index 08a5478a912..1450bb5d6cb 100644 --- a/spacy/pipeline/spancat.py +++ b/spacy/pipeline/spancat.py @@ -1,6 +1,8 @@ -from dataclasses import dataclass -from functools import partial -from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, cast +from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast +from typing import Union +from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops +from thinc.api import Optimizer +from thinc.types import Ragged, Ints2d, Floats2d, Ints1d import numpy from thinc.api import Config, Model, Ops, Optimizer, get_current_ops, set_dropout_rate @@ -16,6 +18,9 @@ from ..vocab import Vocab from .trainable_pipe import TrainablePipe +ActivationsT = Dict[str, Union[Floats2d, Ragged]] + + spancat_default_config = """ [model] @architectures = "spacy.SpanCategorizer.v1" @@ -170,6 +175,7 @@ def build_preset_spans_suggester(spans_key: str) -> Suggester: "model": DEFAULT_SPANCAT_MODEL, "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]}, "scorer": {"@scorers": "spacy.spancat_scorer.v1"}, + "save_activations": False, }, default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0}, ) @@ -182,6 +188,7 @@ def make_spancat( scorer: Optional[Callable], threshold: float, max_positive: Optional[int], + save_activations: bool, ) -> "SpanCategorizer": """Create a SpanCategorizer component and configure it for multi-label classification to be able to assign multiple labels for each span. @@ -209,6 +216,7 @@ def make_spancat( 0.5. max_positive (Optional[int]): Maximum number of labels to consider positive per span. Defaults to None, indicating no limit. + save_activations (bool): save model activations in Doc when annotating. 
""" return SpanCategorizer( nlp.vocab, @@ -287,6 +295,7 @@ def make_spancat_singlelabel( add_negative_label=True, threshold=None, scorer=scorer, + save_activations=save_activations, ) @@ -349,6 +358,7 @@ def __init__( max_positive: Optional[int] = None, threshold: Optional[float] = 0.5, scorer: Optional[Callable] = spancat_score, + save_activations: bool = False, ) -> None: """Initialize the multi-label or multi-class span categorizer. @@ -398,9 +408,7 @@ def __init__( self.model = model self.name = name self.scorer = scorer - self.add_negative_label = add_negative_label - if not allow_overlap and max_positive is not None and max_positive > 1: - raise ValueError(Errors.E1051.format(max_positive=max_positive)) + self.save_activations = save_activations @property def key(self) -> str: @@ -458,28 +466,7 @@ def label_data(self) -> List[str]: """ return list(self.labels) - @property - def _label_map(self) -> Dict[str, int]: - """RETURNS (Dict[str, int]): The label map.""" - return {label: i for i, label in enumerate(self.labels)} - - @property - def _n_labels(self) -> int: - """RETURNS (int): Number of labels.""" - if self.add_negative_label: - return len(self.labels) + 1 - else: - return len(self.labels) - - @property - def _negative_label_i(self) -> Union[int, None]: - """RETURNS (Union[int, None]): Index of the negative label.""" - if self.add_negative_label: - return len(self.label_data) - else: - return None - - def predict(self, docs: Iterable[Doc]): + def predict(self, docs: Iterable[Doc]) -> ActivationsT: """Apply the pipeline's model to a batch of docs, without modifying them. docs (Iterable[Doc]): The documents to predict. @@ -488,11 +475,8 @@ def predict(self, docs: Iterable[Doc]): DOCS: https://spacy.io/api/spancategorizer#predict """ indices = self.suggester(docs, ops=self.model.ops) - if indices.lengths.sum() == 0: - scores = self.model.ops.alloc2f(0, 0) - else: - scores = self.model.predict((docs, indices)) # type: ignore - return indices, scores + scores = self.model.predict((docs, indices)) # type: ignore + return {"indices": indices, "scores": scores} def set_candidates( self, docs: Iterable[Doc], *, candidates_key: str = "candidates" @@ -512,32 +496,32 @@ def set_candidates( for index in candidates.dataXd: doc.spans[candidates_key].append(doc[index[0] : index[1]]) - def set_annotations(self, docs: Iterable[Doc], indices_scores) -> None: + def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None: """Modify a batch of Doc objects, using pre-computed scores. docs (Iterable[Doc]): The documents to modify. - scores: The scores to set, produced by SpanCategorizer.predict. + activations: ActivationsT: The activations, produced by SpanCategorizer.predict. 
DOCS: https://spacy.io/api/spancategorizer#set_annotations """ - indices, scores = indices_scores + labels = self.labels + + indices = activations["indices"] + assert isinstance(indices, Ragged) + scores = cast(Floats2d, activations["scores"]) + offset = 0 for i, doc in enumerate(docs): indices_i = indices[i].dataXd - allow_overlap = cast(bool, self.cfg["allow_overlap"]) - if self.cfg["max_positive"] == 1: - doc.spans[self.key] = self._make_span_group_singlelabel( - doc, - indices_i, - scores[offset : offset + indices.lengths[i]], - allow_overlap, - ) - else: - doc.spans[self.key] = self._make_span_group_multilabel( - doc, - indices_i, - scores[offset : offset + indices.lengths[i]], - ) + if self.save_activations: + doc.activations[self.name] = {} + doc.activations[self.name]["indices"] = indices_i + doc.activations[self.name]["scores"] = scores[ + offset : offset + indices.lengths[i] + ] + doc.spans[self.key] = self._make_span_group( + doc, indices_i, scores[offset : offset + indices.lengths[i]], labels # type: ignore[arg-type] + ) offset += indices.lengths[i] def update( diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 34e85d49c2b..8ecd0c46ee0 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -1,4 +1,10 @@ -# cython: infer_types=True, binding=True +# cython: infer_types=True, profile=True, binding=True +from typing import Callable, Dict, Iterable, List, Optional, Union +import numpy +import srsly +from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config +from thinc.types import Floats2d, Ints1d +import warnings from itertools import islice from typing import Callable, Optional @@ -15,6 +21,9 @@ from ..training import validate_examples, validate_get_examples from ..util import registry from .trainable_pipe import TrainablePipe + +ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]] + # See #9050 BACKWARD_OVERWRITE = False @@ -38,7 +47,13 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"] @Language.factory( "tagger", assigns=["token.tag"], - default_config={"model": DEFAULT_TAGGER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}, "neg_prefix": "!", "label_smoothing": 0.0}, + default_config={ + "model": DEFAULT_TAGGER_MODEL, + "overwrite": False, + "scorer": {"@scorers": "spacy.tagger_scorer.v1"}, + "neg_prefix": "!", + "save_activations": False, + }, default_score_weights={"tag_acc": 1.0}, ) def make_tagger( @@ -48,7 +63,7 @@ def make_tagger( overwrite: bool, scorer: Optional[Callable], neg_prefix: str, - label_smoothing: float, + save_activations: bool, ): """Construct a part-of-speech tagger component. @@ -57,7 +72,8 @@ def make_tagger( in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to 1). """ - return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix, label_smoothing=label_smoothing) + return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix, + save_activations=save_activations) def tagger_score(examples, **kwargs): @@ -83,7 +99,7 @@ class Tagger(TrainablePipe): overwrite=BACKWARD_OVERWRITE, scorer=tagger_score, neg_prefix="!", - label_smoothing=0.0, + save_activations: bool = False, ): """Initialize a part-of-speech tagger. @@ -93,6 +109,7 @@ class Tagger(TrainablePipe): losses during training. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_token_attr for the attribute "tag". 
+ save_activations (bool): save model activations in Doc when annotating. DOCS: https://spacy.io/api/tagger#init """ @@ -103,6 +120,7 @@ class Tagger(TrainablePipe): cfg = {"labels": [], "overwrite": overwrite, "neg_prefix": neg_prefix, "label_smoothing": label_smoothing} self.cfg = dict(sorted(cfg.items())) self.scorer = scorer + self.save_activations = save_activations @property def labels(self): @@ -121,7 +139,7 @@ class Tagger(TrainablePipe): """Data about the labels currently added to the component.""" return tuple(self.cfg["labels"]) - def predict(self, docs): + def predict(self, docs) -> ActivationsT: """Apply the pipeline's model to a batch of docs, without modifying them. docs (Iterable[Doc]): The documents to predict. @@ -134,12 +152,12 @@ class Tagger(TrainablePipe): n_labels = len(self.labels) guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs] assert len(guesses) == len(docs) - return guesses + return {"probabilities": guesses, "label_ids": guesses} scores = self.model.predict(docs) assert len(scores) == len(docs), (len(scores), len(docs)) guesses = self._scores2guesses(scores) assert len(guesses) == len(docs) - return guesses + return {"probabilities": scores, "label_ids": guesses} def _scores2guesses(self, scores): guesses = [] @@ -150,20 +168,25 @@ class Tagger(TrainablePipe): guesses.append(doc_guesses) return guesses - def set_annotations(self, docs, batch_tag_ids): + def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT): """Modify a batch of documents, using pre-computed scores. docs (Iterable[Doc]): The documents to modify. - batch_tag_ids: The IDs to set, produced by Tagger.predict. + activations (ActivationsT): The activations used for setting annotations, produced by Tagger.predict. DOCS: https://spacy.io/api/tagger#set_annotations """ + batch_tag_ids = activations["label_ids"] if isinstance(docs, Doc): docs = [docs] cdef Doc doc cdef bint overwrite = self.cfg["overwrite"] labels = self.labels for i, doc in enumerate(docs): + if self.save_activations: + doc.activations[self.name] = {} + for act_name, acts in activations.items(): + doc.activations[self.name][act_name] = acts[i] doc_tag_ids = batch_tag_ids[i] if hasattr(doc_tag_ids, "get"): doc_tag_ids = doc_tag_ids.get() diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index ae227017a9f..6cb33109891 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -1,3 +1,7 @@ +from typing import Iterable, Tuple, Optional, Dict, List, Callable, Any, Union +from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config +from thinc.types import Floats2d +import numpy from itertools import islice from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple @@ -14,6 +18,9 @@ from ..vocab import Vocab from .trainable_pipe import TrainablePipe +ActivationsT = Dict[str, Floats2d] + + single_label_default_config = """ [model] @architectures = "spacy.TextCatEnsemble.v2" @@ -80,7 +87,8 @@ default_config={ "threshold": 0.0, "model": DEFAULT_SINGLE_TEXTCAT_MODEL, - "scorer": {"@scorers": "spacy.textcat_scorer.v2"}, + "scorer": {"@scorers": "spacy.textcat_scorer.v1"}, + "save_activations": False, }, default_score_weights={ "cats_score": 1.0, @@ -101,6 +109,7 @@ def make_textcat( model: Model[List[Doc], List[Floats2d]], threshold: float, scorer: Optional[Callable], + save_activations: bool, ) -> "TextCategorizer": """Create a TextCategorizer component. The text categorizer predicts categories over a whole document. 
It can learn one or more labels, and the labels are considered @@ -110,8 +119,16 @@ def make_textcat( scores for each category. threshold (float): Cutoff to consider a prediction "positive". scorer (Optional[Callable]): The scoring method. + save_activations (bool): save model activations in Doc when annotating. """ - return TextCategorizer(nlp.vocab, model, name, threshold=threshold, scorer=scorer) + return TextCategorizer( + nlp.vocab, + model, + name, + threshold=threshold, + scorer=scorer, + save_activations=save_activations, + ) def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: @@ -142,6 +159,7 @@ def __init__( *, threshold: float, scorer: Optional[Callable] = textcat_score, + save_activations: bool = False, ) -> None: """Initialize a text categorizer for single-label classification. @@ -167,6 +185,7 @@ def __init__( } self.cfg = dict(cfg) self.scorer = scorer + self.save_activations = save_activations @property def support_missing_values(self): @@ -191,7 +210,7 @@ def label_data(self) -> List[str]: """ return self.labels # type: ignore[return-value] - def predict(self, docs: Iterable[Doc]): + def predict(self, docs: Iterable[Doc]) -> ActivationsT: """Apply the pipeline's model to a batch of docs, without modifying them. docs (Iterable[Doc]): The documents to predict. @@ -204,12 +223,12 @@ def predict(self, docs: Iterable[Doc]): tensors = [doc.tensor for doc in docs] xp = self.model.ops.xp scores = xp.zeros((len(list(docs)), len(self.labels))) - return scores + return {"probabilities": scores} scores = self.model.predict(docs) scores = self.model.ops.asarray(scores) - return scores + return {"probabilities": scores} - def set_annotations(self, docs: Iterable[Doc], scores) -> None: + def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None: """Modify a batch of Doc objects, using pre-computed scores. docs (Iterable[Doc]): The documents to modify. @@ -217,9 +236,13 @@ def set_annotations(self, docs: Iterable[Doc], scores) -> None: DOCS: https://spacy.io/api/textcategorizer#set_annotations """ + probs = activations["probabilities"] for i, doc in enumerate(docs): + if self.save_activations: + doc.activations[self.name] = {} + doc.activations[self.name]["probabilities"] = probs[i] for j, label in enumerate(self.labels): - doc.cats[label] = float(scores[i, j]) + doc.cats[label] = float(probs[i, j]) def update( self, diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py index 2f8d5e60437..ac024ba3639 100644 --- a/spacy/pipeline/textcat_multilabel.py +++ b/spacy/pipeline/textcat_multilabel.py @@ -1,3 +1,7 @@ +from typing import Iterable, Optional, Dict, List, Callable, Any, Union +from thinc.types import Floats2d +from thinc.api import Model, Config + from itertools import islice from typing import Any, Callable, Dict, Iterable, List, Optional @@ -78,7 +82,8 @@ default_config={ "threshold": 0.5, "model": DEFAULT_MULTI_TEXTCAT_MODEL, - "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v2"}, + "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"}, + "save_activations": False, }, default_score_weights={ "cats_score": 1.0, @@ -99,8 +104,9 @@ def make_multilabel_textcat( model: Model[List[Doc], List[Floats2d]], threshold: float, scorer: Optional[Callable], -) -> "MultiLabel_TextCategorizer": - """Create a MultiLabel_TextCategorizer component. The text categorizer predicts categories + save_activations: bool, +) -> "TextCategorizer": + """Create a TextCategorizer component. 
The text categorizer predicts categories over a whole document. It can learn one or more labels, and the labels are considered to be non-mutually exclusive, which means that there can be zero or more labels per doc). @@ -111,7 +117,12 @@ def make_multilabel_textcat( scorer (Optional[Callable]): The scoring method. """ return MultiLabel_TextCategorizer( - nlp.vocab, model, name, threshold=threshold, scorer=scorer + nlp.vocab, + model, + name, + threshold=threshold, + scorer=scorer, + save_activations=save_activations, ) @@ -143,6 +154,7 @@ def __init__( *, threshold: float, scorer: Optional[Callable] = textcat_multilabel_score, + save_activations: bool = False, ) -> None: """Initialize a text categorizer for multi-label classification. @@ -151,7 +163,7 @@ def __init__( name (str): The component instance name, used to add entries to the losses during training. threshold (float): Cutoff to consider a prediction "positive". - scorer (Optional[Callable]): The scoring method. + save_activations (bool): save model activations in Doc when annotating. DOCS: https://spacy.io/api/textcategorizer#init """ @@ -162,6 +174,7 @@ def __init__( cfg = {"labels": [], "threshold": threshold} self.cfg = dict(cfg) self.scorer = scorer + self.save_activations = save_activations @property def support_missing_values(self): diff --git a/spacy/pipeline/trainable_pipe.pxd b/spacy/pipeline/trainable_pipe.pxd index b1d2550a1ce..3e9a0a9584d 100644 --- a/spacy/pipeline/trainable_pipe.pxd +++ b/spacy/pipeline/trainable_pipe.pxd @@ -7,3 +7,4 @@ cdef class TrainablePipe(Pipe): cdef public object model cdef public object cfg cdef public object scorer + cdef bint _save_activations diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx index 8f219b32797..bd360c9501b 100644 --- a/spacy/pipeline/trainable_pipe.pyx +++ b/spacy/pipeline/trainable_pipe.pyx @@ -2,10 +2,14 @@ from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple import srsly -from thinc.api import Model, Optimizer, set_dropout_rate +from thinc.api import set_dropout_rate, Model, Optimizer +import warnings from ..tokens.doc cimport Doc +from ..training import validate_examples +from ..errors import Errors, Warnings +from .pipe import Pipe, deserialize_config from .. 
import util from ..errors import Errors from ..language import Language @@ -342,3 +346,11 @@ cdef class TrainablePipe(Pipe): deserialize["model"] = load_model util.from_disk(path, deserialize, exclude) return self + + @property + def save_activations(self): + return self._save_activations + + @save_activations.setter + def save_activations(self, save_activations: bool): + self._save_activations = save_activations diff --git a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py index 5a8f0aee2ab..ba2ed4e5ff3 100644 --- a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py +++ b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py @@ -1,3 +1,4 @@ +from typing import cast import pickle import hypothesis.strategies as st @@ -8,6 +9,8 @@ from spacy.lang.en import English from spacy.language import Language from spacy.pipeline._edit_tree_internals.edit_trees import EditTrees +from spacy.pipeline.trainable_pipe import TrainablePipe +from spacy.training import Example from spacy.strings import StringStore from spacy.training import Example from spacy.util import make_tempdir @@ -331,3 +334,26 @@ def test_empty_strings(): no_change = trees.add("xyz", "xyz") empty = trees.add("", "") assert no_change == empty + + +def test_save_activations(): + nlp = English() + lemmatizer = cast(TrainablePipe, nlp.add_pipe("trainable_lemmatizer")) + lemmatizer.min_tree_freq = 1 + train_examples = [] + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + nlp.initialize(get_examples=lambda: train_examples) + nO = lemmatizer.model.get_dim("nO") + + doc = nlp("This is a test.") + assert "trainable_lemmatizer" not in doc.activations + + lemmatizer.save_activations = True + doc = nlp("This is a test.") + assert list(doc.activations["trainable_lemmatizer"].keys()) == [ + "probabilities", + "tree_ids", + ] + assert doc.activations["trainable_lemmatizer"]["probabilities"].shape == (5, nO) + assert doc.activations["trainable_lemmatizer"]["tree_ids"].shape == (5,) diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 00771a0f0f8..844bacb3b1f 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -1,7 +1,8 @@ -from typing import Any, Callable, Dict, Iterable, Tuple +from typing import Callable, Iterable, Dict, Any, cast import pytest from numpy.testing import assert_equal +from thinc.types import Ragged from spacy import Language, registry, util from spacy.attrs import ENT_KB_ID @@ -9,8 +10,7 @@ from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase, get_candidates from spacy.lang.en import English from spacy.ml import load_kb -from spacy.ml.models.entity_linker import build_span_maker -from spacy.pipeline import EntityLinker +from spacy.pipeline import EntityLinker, TrainablePipe from spacy.pipeline.legacy import EntityLinker_v1 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL from spacy.scorer import Scorer @@ -1194,16 +1194,64 @@ def create_kb(vocab): assert doc.ents[0].kb_id_ == entity_id if meet_threshold else EntityLinker.NIL -def test_span_maker_forward_with_empty(): - """The forward pass of the span maker may have a doc with no entities.""" +def test_save_activations(): nlp = English() - doc1 = nlp("a b c") - ent = doc1[0:1] - ent.label_ = "X" - doc1.ents = [ent] - # no entities - doc2 = nlp("x y z") - - # just to get a model - span_maker = build_span_maker() - span_maker([doc1, doc2], False) + vector_length = 3 + 
assert "Q2146908" not in nlp.vocab.strings + + # Convert the texts to docs to make sure we have doc.ents set for the training examples + train_examples = [] + for text, annotation in TRAIN_DATA: + doc = nlp(text) + train_examples.append(Example.from_dict(doc, annotation)) + + def create_kb(vocab): + # create artificial KB - assign same prior weight to the two russ cochran's + # Q2146908 (Russ Cochran): American golfer + # Q7381115 (Russ Cochran): publisher + mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) + mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) + mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7]) + mykb.add_alias( + alias="Russ Cochran", + entities=["Q2146908", "Q7381115"], + probabilities=[0.5, 0.5], + ) + return mykb + + # Create the Entity Linker component and add it to the pipeline + entity_linker = cast(TrainablePipe, nlp.add_pipe("entity_linker", last=True)) + assert isinstance(entity_linker, EntityLinker) + entity_linker.set_kb(create_kb) + assert "Q2146908" in entity_linker.vocab.strings + assert "Q2146908" in entity_linker.kb.vocab.strings + + # initialize the NEL pipe + nlp.initialize(get_examples=lambda: train_examples) + + nO = entity_linker.model.get_dim("nO") + + nlp.add_pipe("sentencizer", first=True) + patterns = [ + {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}, + {"label": "ORG", "pattern": [{"LOWER": "ec"}, {"LOWER": "comics"}]}, + ] + ruler = nlp.add_pipe("entity_ruler", before="entity_linker") + ruler.add_patterns(patterns) + + doc = nlp("Russ Cochran was a publisher") + assert "entity_linker" not in doc.activations + + entity_linker.save_activations = True + doc = nlp("Russ Cochran was a publisher") + assert set(doc.activations["entity_linker"].keys()) == {"ents", "scores"} + ents = doc.activations["entity_linker"]["ents"] + assert isinstance(ents, Ragged) + assert ents.data.shape == (2, 1) + assert ents.data.dtype == "uint64" + assert ents.lengths.shape == (1,) + scores = doc.activations["entity_linker"]["scores"] + assert isinstance(scores, Ragged) + assert scores.data.shape == (2, 1) + assert scores.data.dtype == "float32" + assert scores.lengths.shape == (1,) diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py index 0d895f23688..c2b65977ac3 100644 --- a/spacy/tests/pipeline/test_morphologizer.py +++ b/spacy/tests/pipeline/test_morphologizer.py @@ -1,3 +1,4 @@ +from typing import cast import pytest from numpy.testing import assert_almost_equal, assert_equal from thinc.api import get_current_ops @@ -7,7 +8,8 @@ from spacy.lang.en import English from spacy.language import Language from spacy.morphology import Morphology -from spacy.tests.util import make_tempdir +from spacy.pipeline import TrainablePipe +from spacy.attrs import MORPH from spacy.tokens import Doc from spacy.training import Example @@ -224,3 +226,25 @@ def test_overfitting_IO(): gold_pos_tags = ["NOUN", "NOUN", "NOUN", "NOUN"] assert [str(t.morph) for t in doc] == gold_morphs assert [t.pos_ for t in doc] == gold_pos_tags + + +def test_save_activations(): + nlp = English() + morphologizer = cast(TrainablePipe, nlp.add_pipe("morphologizer")) + train_examples = [] + for inst in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1])) + nlp.initialize(get_examples=lambda: train_examples) + + doc = nlp("This is a test.") + assert "morphologizer" not in doc.activations + + morphologizer.save_activations = True + doc = nlp("This is a 
test.") + assert "morphologizer" in doc.activations + assert set(doc.activations["morphologizer"].keys()) == { + "label_ids", + "probabilities", + } + assert doc.activations["morphologizer"]["probabilities"].shape == (5, 6) + assert doc.activations["morphologizer"]["label_ids"].shape == (5,) diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py index 6c76558123f..2e40d86ff48 100644 --- a/spacy/tests/pipeline/test_senter.py +++ b/spacy/tests/pipeline/test_senter.py @@ -1,3 +1,4 @@ +from typing import cast import pytest from numpy.testing import assert_equal @@ -5,6 +6,7 @@ from spacy.attrs import SENT_START from spacy.lang.en import English from spacy.language import Language +from spacy.pipeline import TrainablePipe from spacy.tests.util import make_tempdir from spacy.training import Example @@ -101,3 +103,26 @@ def test_overfitting_IO(): # test internal pipe labels vs. Language.pipe_labels with hidden labels assert nlp.get_pipe("senter").labels == ("I", "S") assert "senter" not in nlp.pipe_labels + + +def test_save_activations(): + # Test if activations are correctly added to Doc when requested. + nlp = English() + senter = cast(TrainablePipe, nlp.add_pipe("senter")) + + train_examples = [] + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + + nlp.initialize(get_examples=lambda: train_examples) + nO = senter.model.get_dim("nO") + + doc = nlp("This is a test.") + assert "senter" not in doc.activations + + senter.save_activations = True + doc = nlp("This is a test.") + assert "senter" in doc.activations + assert set(doc.activations["senter"].keys()) == {"label_ids", "probabilities"} + assert doc.activations["senter"]["probabilities"].shape == (5, nO) + assert doc.activations["senter"]["label_ids"].shape == (5,) diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py index c143d193fa6..9678e9b63b8 100644 --- a/spacy/tests/pipeline/test_spancat.py +++ b/spacy/tests/pipeline/test_spancat.py @@ -594,19 +594,21 @@ def test_set_candidates(name): assert docs[0].spans["candidates"][4].text == "Just a" -@pytest.mark.parametrize("name", SPANCAT_COMPONENTS) -@pytest.mark.parametrize("n_process", [1, 2]) -def test_spancat_multiprocessing(name, n_process): - if isinstance(get_current_ops, NumpyOps) or n_process < 2: - nlp = Language() - spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY}) - train_examples = make_examples(nlp) - nlp.initialize(get_examples=lambda: train_examples) - texts = [ - "Just a sentence.", - "I like London and Berlin", - "I like Berlin", - "I eat ham.", - ] - docs = list(nlp.pipe(texts, n_process=n_process)) - assert len(docs) == len(texts) +def test_save_activations(): + # Test if activations are correctly added to Doc when requested. 
+ nlp = English() + spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY}) + train_examples = make_examples(nlp) + nlp.initialize(get_examples=lambda: train_examples) + nO = spancat.model.get_dim("nO") + assert nO == 2 + assert set(spancat.labels) == {"LOC", "PERSON"} + + doc = nlp("This is a test.") + assert "spancat" not in doc.activations + + spancat.save_activations = True + doc = nlp("This is a test.") + assert set(doc.activations["spancat"].keys()) == {"indices", "scores"} + assert doc.activations["spancat"]["indices"].shape == (12, 2) + assert doc.activations["spancat"]["scores"].shape == (12, nO) diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index 4b5f1ee99fc..5deb323dd71 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -1,3 +1,4 @@ +from typing import cast import pytest from numpy.testing import assert_almost_equal, assert_equal from thinc.api import compounding, get_current_ops @@ -6,7 +7,8 @@ from spacy.attrs import TAG from spacy.lang.en import English from spacy.language import Language -from spacy.training import Example +from spacy.pipeline import TrainablePipe +from thinc.api import compounding from ..util import make_tempdir @@ -235,6 +237,26 @@ def test_overfitting_IO(): assert doc3[0].tag_ != "N" +def test_save_activations(): + # Test if activations are correctly added to Doc when requested. + nlp = English() + tagger = cast(TrainablePipe, nlp.add_pipe("tagger")) + train_examples = [] + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + nlp.initialize(get_examples=lambda: train_examples) + + doc = nlp("This is a test.") + assert "tagger" not in doc.activations + + tagger.save_activations = True + doc = nlp("This is a test.") + assert "tagger" in doc.activations + assert set(doc.activations["tagger"].keys()) == {"label_ids", "probabilities"} + assert doc.activations["tagger"]["probabilities"].shape == (5, len(TAGS)) + assert doc.activations["tagger"]["label_ids"].shape == (5,) + + def test_tagger_requires_labels(): nlp = English() nlp.add_pipe("tagger") diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 8a0c1a9760d..710dac0571d 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -1,3 +1,4 @@ +from typing import cast import random import numpy.random @@ -11,17 +12,13 @@ from spacy.cli.evaluate import print_prf_per_type, print_textcats_auc_per_cat from spacy.lang.en import English from spacy.language import Language -from spacy.pipeline import TextCategorizer -from spacy.pipeline.textcat import ( - single_label_bow_config, - single_label_cnn_config, - single_label_default_config, -) -from spacy.pipeline.textcat_multilabel import ( - multi_label_bow_config, - multi_label_cnn_config, - multi_label_default_config, -) +from spacy.pipeline import TextCategorizer, TrainablePipe +from spacy.pipeline.textcat import single_label_bow_config +from spacy.pipeline.textcat import single_label_cnn_config +from spacy.pipeline.textcat import single_label_default_config +from spacy.pipeline.textcat_multilabel import multi_label_bow_config +from spacy.pipeline.textcat_multilabel import multi_label_cnn_config +from spacy.pipeline.textcat_multilabel import multi_label_default_config from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL from spacy.scorer import Scorer from spacy.tokens import Doc, DocBin @@ -298,7 +295,7 @@ def test_issue9904(): 
nlp.initialize(get_examples) examples = get_examples() - scores = textcat.predict([eg.predicted for eg in examples]) + scores = textcat.predict([eg.predicted for eg in examples])["probabilities"] loss = textcat.get_loss(examples, scores)[0] loss_double_bs = textcat.get_loss(examples * 2, scores.repeat(2, axis=0))[0] @@ -949,24 +946,39 @@ def test_textcat_multi_threshold(): assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0 -@pytest.mark.parametrize( - "component_name,scorer", - [ - ("textcat", "spacy.textcat_scorer.v1"), - ("textcat_multilabel", "spacy.textcat_multilabel_scorer.v1"), - ], -) -def test_textcat_legacy_scorers(component_name, scorer): - """Check that legacy scorers are registered and produce the expected score - keys.""" +def test_save_activations(): nlp = English() - nlp.add_pipe(component_name, config={"scorer": {"@scorers": scorer}}) + textcat = cast(TrainablePipe, nlp.add_pipe("textcat")) train_examples = [] for text, annotations in TRAIN_DATA_SINGLE_LABEL: train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) nlp.initialize(get_examples=lambda: train_examples) + nO = textcat.model.get_dim("nO") - # score the model (it's not actually trained but that doesn't matter) - scores = nlp.evaluate(train_examples) - assert 0 <= scores["cats_score"] <= 1 + doc = nlp("This is a test.") + assert "textcat" not in doc.activations + + textcat.save_activations = True + doc = nlp("This is a test.") + assert list(doc.activations["textcat"].keys()) == ["probabilities"] + assert doc.activations["textcat"]["probabilities"].shape == (nO,) + + +def test_save_activations_multi(): + nlp = English() + textcat = cast(TrainablePipe, nlp.add_pipe("textcat_multilabel")) + + train_examples = [] + for text, annotations in TRAIN_DATA_MULTI_LABEL: + train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) + nlp.initialize(get_examples=lambda: train_examples) + nO = textcat.model.get_dim("nO") + + doc = nlp("This is a test.") + assert "textcat_multilabel" not in doc.activations + + textcat.save_activations = True + doc = nlp("This is a test.") + assert list(doc.activations["textcat_multilabel"].keys()) == ["probabilities"] + assert doc.activations["textcat_multilabel"]["probabilities"].shape == (nO,) diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index d9719609cdc..5e8975ed337 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -50,6 +50,8 @@ cdef class Doc: cdef public float sentiment + cdef public dict activations + cdef public dict user_hooks cdef public dict user_token_hooks cdef public dict user_span_hooks diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi index 0fae118b4b6..5fda6f2f789 100644 --- a/spacy/tokens/doc.pyi +++ b/spacy/tokens/doc.pyi @@ -16,7 +16,7 @@ from typing import ( import numpy as np from cymem.cymem import Pool -from thinc.types import Floats1d, Floats2d, Ints2d +from thinc.types import ArrayXd, Floats1d, Floats2d, Ints2d, Ragged from .span import Span from .token import Token from .span_groups import SpanGroups @@ -41,6 +41,7 @@ class Doc: max_length: int length: int sentiment: float + activations: Dict[str, Dict[str, Union[ArrayXd, Ragged]]] cats: Dict[str, float] user_hooks: Dict[str, Callable[..., Any]] user_token_hooks: Dict[str, Callable[..., Any]] diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 8db8c1d6f37..497656b6570 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -281,6 +281,7 @@ cdef class Doc: self.length = 0 self.sentiment = 0.0 self.cats = {} + self.activations = {} 
self.user_hooks = {} self.user_token_hooks = {} self.user_span_hooks = {} diff --git a/website/docs/api/doc.mdx b/website/docs/api/doc.mdx index 0a582650076..310ce0dc88d 100644 --- a/website/docs/api/doc.mdx +++ b/website/docs/api/doc.mdx @@ -752,22 +752,23 @@ The L2 norm of the document's vector representation. ## Attributes {id="attributes"} -| Name | Description | -| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | -| `text` | A string representation of the document text. ~~str~~ | -| `text_with_ws` | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~ | -| `mem` | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~ | -| `vocab` | The store of lexical types. ~~Vocab~~ | -| `tensor` | Container for dense vector representations. ~~numpy.ndarray~~ | -| `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ | -| `lang` | Language of the document's vocabulary. ~~int~~ | -| `lang_` | Language of the document's vocabulary. ~~str~~ | -| `sentiment` | The document's positivity/negativity score, if available. ~~float~~ | -| `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ | -| `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ | -| `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ | -| `has_unknown_spaces` | Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization). ~~bool~~ | -| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ | +| Name | Description | +| ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------- | +| `text` | A string representation of the document text. ~~str~~ | +| `text_with_ws` | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~ | +| `mem` | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~ | +| `vocab` | The store of lexical types. ~~Vocab~~ | +| `tensor` 2 | Container for dense vector representations. ~~numpy.ndarray~~ | +| `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ | +| `lang` 2.1 | Language of the document's vocabulary. ~~int~~ | +| `lang_` 2.1 | Language of the document's vocabulary. ~~str~~ | +| `sentiment` | The document's positivity/negativity score, if available. ~~float~~ | +| `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ | +| `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ | +| `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ | +| `has_unknown_spaces` | Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization). ~~bool~~ | +| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). 
~~Underscore~~                                                                   |
+| `activations` 4.0                          | A dictionary of activations per trainable pipe (available when the `save_activations` option of a pipe is enabled). ~~Dict[str, Optional[Any]]~~                |

 ## Serialization fields {id="serialization-fields"}

diff --git a/website/docs/api/edittreelemmatizer.mdx b/website/docs/api/edittreelemmatizer.mdx
index 82967482c90..17af19e8c38 100644
--- a/website/docs/api/edittreelemmatizer.mdx
+++ b/website/docs/api/edittreelemmatizer.mdx
@@ -44,14 +44,15 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("trainable_lemmatizer", config=config, name="lemmatizer")
 > ```

-| Setting         | Description                                                                                                                                                                                                                                                                                                          |
-| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `model`         | A model instance that predicts the edit tree probabilities. The output vectors should match the number of edit trees in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
-| `backoff`       | ~~Token~~ attribute to use when no applicable edit tree is found. Defaults to `orth`. ~~str~~                                                                                                                                                                                                                       |
-| `min_tree_freq` | Minimum frequency of an edit tree in the training set to be used. Defaults to `3`. ~~int~~                                                                                                                                                                                                                          |
-| `overwrite`     | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~                                                                                                                                                                                                                                           |
-| `top_k`         | The number of most probable edit trees to try before resorting to `backoff`. Defaults to `1`. ~~int~~                                                                                                                                                                                                               |
-| `scorer`        | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"lemma"`. ~~Optional[Callable]~~                                                                                                                                                                       |
+| Setting                                          | Description                                                                                                                                                                                                                                                                                                          |
+| ------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `model`                                          | A model instance that predicts the edit tree probabilities. The output vectors should match the number of edit trees in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
+| `backoff`                                        | ~~Token~~ attribute to use when no applicable edit tree is found. Defaults to `orth`. ~~str~~                                                                                                                                                                                                                       |
+| `min_tree_freq`                                  | Minimum frequency of an edit tree in the training set to be used. Defaults to `3`. ~~int~~                                                                                                                                                                                                                          |
+| `overwrite`                                      | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~                                                                                                                                                                                                                                           |
+| `top_k`                                          | The number of most probable edit trees to try before resorting to `backoff`. Defaults to `1`. ~~int~~                                                                                                                                                                                                               |
+| `scorer`                                         | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"lemma"`. ~~Optional[Callable]~~                                                                                                                                                                       |
+| `save_activations` 4.0                           | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"tree_ids"`. ~~Union[bool, list[str]]~~                                                                                                                                                                                     |
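A minimal usage sketch of the option documented above, assuming a pipeline (the
package name `my_pipeline` is a placeholder) that contains a trained
`trainable_lemmatizer`; activations are keyed by the component's pipe name:

```python
import spacy

# Placeholder model name; any pipeline with a trained
# "trainable_lemmatizer" component will do.
nlp = spacy.load("my_pipeline")
lemmatizer = nlp.get_pipe("trainable_lemmatizer")

# Saving activations is off by default: nothing is stored on the Doc.
doc = nlp("These are some tokens.")
assert "trainable_lemmatizer" not in doc.activations

# Opt in, after which set_annotations() fills Doc.activations.
lemmatizer.save_activations = True
doc = nlp("These are some tokens.")
acts = doc.activations["trainable_lemmatizer"]
print(acts["tree_ids"].shape)       # one edit tree id per token
print(acts["probabilities"].shape)  # (n_tokens, n_edit_trees)
```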
+| Setting                                          | Description                                                                                                                                                                                                                                                                                    |
+| ------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `labels_discard`                                 | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~                                                                                                                                                                                                 |
+| `n_sents`                                        | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~                                                                                                                                                                                                              |
+| `incl_prior`                                     | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                           |
+| `incl_context`                                   | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                                         |
+| `model`                                          | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~                                                                                                                                         |
+| `entity_vector_length`                           | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~                                                                                                                                                                                                                                  |
+| `use_gold_ents`                                  | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~bool~~                                                                                                          |
+| `get_candidates`                                 | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~                       |
+| `overwrite` 3.2                                  | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                                                       |
+| `scorer` 3.2                                     | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~                                                                                                                                                                                        |
+| `save_activations` 4.0                           | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~                                                                                                                                                                           |
+| `threshold` 3.4                                  | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~   |

 ```python
 %%GITHUB_SPACY/spacy/pipeline/entity_linker.py
diff --git a/website/docs/api/morphologizer.mdx b/website/docs/api/morphologizer.mdx
index ce16f534219..1fda807cb32 100644
--- a/website/docs/api/morphologizer.mdx
+++ b/website/docs/api/morphologizer.mdx
@@ -42,13 +42,13 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("morphologizer", config=config)
 > ```

-| Setting                                         | Description                                                                                                                                                                                                                                                                                                           |
-| ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `model`                                         | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                                                               |
-| `overwrite` 3.2                                 | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                                                                 |
-| `extend` 3.2                                    | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~                                                                                                                                                                     |
-| `scorer` 3.2                                    | The scoring method.
Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ | -| `label_smoothing` 3.6 | [Label smoothing](https://arxiv.org/abs/1906.02629) factor. Defaults to `0.0`. ~~float~~ | +| Setting | Description | +| ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `model` | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | +| `overwrite` 3.2 | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~ | +| `extend` 3.2 | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ | +| `save_activations` 4.0 | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/morphologizer.pyx @@ -400,8 +400,8 @@ coarse-grained POS as the feature `POS`. > assert "Mood=Ind|POS=VERB|Tense=Past|VerbForm=Fin" in morphologizer.labels > ``` -| Name | Description | -| ----------- | ------------------------------------------------------ | +| Name | Description | +| ----------- | --------------------------------------------------------- | | **RETURNS** | The labels added to the component. ~~Iterable[str, ...]~~ | ## Morphologizer.label_data {id="label_data",tag="property",version="3"} diff --git a/website/docs/api/sentencerecognizer.mdx b/website/docs/api/sentencerecognizer.mdx index 5435399f956..d5d096d7659 100644 --- a/website/docs/api/sentencerecognizer.mdx +++ b/website/docs/api/sentencerecognizer.mdx @@ -39,11 +39,12 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("senter", config=config) > ``` -| Setting | Description | -| ---------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | -| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ | -| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for the attribute `"sents"`. ~~Optional[Callable]~~ | +| Setting | Description | +| ----------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [Tagger](/api/architectures#Tagger). 
~~Model[List[Doc], List[Floats2d]]~~ | +| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for the attribute `"sents"`. ~~Optional[Callable]~~ | +| `save_activations` 4.0 | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/senter.pyx diff --git a/website/docs/api/spancategorizer.mdx b/website/docs/api/spancategorizer.mdx index 98a1948eeab..258db794786 100644 --- a/website/docs/api/spancategorizer.mdx +++ b/website/docs/api/spancategorizer.mdx @@ -62,32 +62,15 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("spancat", config=config) > ``` -> #### Example (spancat_singlelabel) -> -> ```python -> from spacy.pipeline.spancat import DEFAULT_SPANCAT_SINGLELABEL_MODEL -> config = { -> "spans_key": "labeled_spans", -> "model": DEFAULT_SPANCAT_SINGLELABEL_MODEL, -> "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]}, -> # Additional spancat_singlelabel parameters -> "negative_weight": 0.8, -> "allow_overlap": True, -> } -> nlp.add_pipe("spancat_singlelabel", config=config) -> ``` - -| Setting | Description | -| --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. Defaults to [`ngram_suggester`](#ngram_suggester). ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ | -| `model` | A model instance that is given a a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. Defaults to [SpanCategorizer](/api/architectures#SpanCategorizer). ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ | -| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ | -| `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Meant to be used in combination with the multi-class `spancat` component with a `Logistic` scoring layer. Defaults to `0.5`. ~~float~~ | -| `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. Meant to be used together with the `spancat` component and defaults to 0 with `spancat_singlelabel`. ~~Optional[int]~~ | -| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ | -| `add_negative_label` 3.5.1 | Whether to learn to predict a special negative label for each unannotated `Span` . This should be `True` when using a `Softmax` classifier layer and so its `True` by default for `spancat_singlelabel`. Spans with negative labels and their scores are not stored as annotations. ~~bool~~ | -| `negative_weight` 3.5.1 | Multiplier for the loss terms. 
It can be used to downweight the negative samples if there are too many. It is only used when `add_negative_label` is `True`. Defaults to `1.0`. ~~float~~                                                   |
-| `allow_overlap` 3.5.1                                | If `True`, the data is assumed to contain overlapping spans. It is only available when `max_positive` is exactly 1. Defaults to `True`. ~~bool~~                                                                                                                                                           |
+| Setting                                          | Description                                                                                                                                                                                                                                                                                                |
+| ------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `suggester`                                      | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. Defaults to [`ngram_suggester`](#ngram_suggester). ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~                                                    |
+| `model`                                          | A model instance that is given a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. Defaults to [SpanCategorizer](/api/architectures#SpanCategorizer). ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~     |
+| `spans_key`                                      | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~                                                                                    |
+| `threshold`                                      | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~                                                                                                                                                            |
+| `max_positive`                                   | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. ~~Optional[int]~~                                                                                                                                                                                        |
+| `scorer`                                         | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~                                                                                                                                         |
+| `save_activations` 4.0                           | Save activations in `Doc` when annotating. Saved activations are `"indices"` and `"scores"`. ~~Union[bool, list[str]]~~                                                                                                                                                                                   |
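Likewise for the span categorizer, a sketch of what the saved activations look
like, assuming a pipeline with a trained `spancat` (`my_spancat_pipeline` is a
placeholder); the `"indices"` rows line up one-to-one with the `"scores"` rows:

```python
import spacy

# Placeholder model name; assumes a trained "spancat" component.
nlp = spacy.load("my_spancat_pipeline")
spancat = nlp.get_pipe("spancat")
spancat.save_activations = True

doc = nlp("I like London and Berlin.")
acts = doc.activations["spancat"]
# "indices" holds one (start, end) token-offset pair per candidate span
# produced by the suggester; "scores" holds that candidate's label scores.
print(acts["indices"].shape)  # (n_candidate_spans, 2)
print(acts["scores"].shape)   # (n_candidate_spans, n_labels)
```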
diff --git a/website/docs/api/tagger.mdx b/website/docs/api/tagger.mdx
index d9b0506fb17..20852e8eb94 100644
--- a/website/docs/api/tagger.mdx
+++ b/website/docs/api/tagger.mdx
@@ -40,13 +40,13 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("tagger", config=config)
 > ```

-| Setting                                         | Description                                                                                                                                                                                                                                                                                               |
-| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `model`                                         | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~   |
-| `overwrite` 3.2                                 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~                                                                                                                                                                                                                                 |
-| `scorer` 3.2                                    | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Optional[Callable]~~                                                                                                                                                               |
-| `neg_prefix` 3.2.1                              | The prefix used to specify incorrect tags while training. The tagger will learn not to predict exactly this tag. Defaults to `!`. ~~str~~                                                                                                                                                                 |
-| `label_smoothing` 3.6                           | [Label smoothing](https://arxiv.org/abs/1906.02629) factor. Defaults to `0.0`. ~~float~~                                                                                                                                                                                                                  |
+| Setting                                          | Description                                                                                                                                                                                                                                                                                               |
+| ------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model`                                          | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~   |
+| `overwrite` 3.2                                  | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~                                                                                                                                                                                                                                 |
+| `scorer` 3.2                                     | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Optional[Callable]~~                                                                                                                                                               |
+| `neg_prefix` 3.2.1                               | The prefix used to specify incorrect tags while training. The tagger will learn not to predict exactly this tag. Defaults to `!`. ~~str~~                                                                                                                                                                 |
+| `save_activations` 4.0                           | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~                                                                                                                                                                          |

 ```python
 %%GITHUB_SPACY/spacy/pipeline/tagger.pyx
diff --git a/website/docs/api/textcategorizer.mdx b/website/docs/api/textcategorizer.mdx
index a259b7b3c65..a1dfb6dd88e 100644
--- a/website/docs/api/textcategorizer.mdx
+++ b/website/docs/api/textcategorizer.mdx
@@ -116,14 +116,15 @@ Create a new pipeline instance. In your application, you would normally use a
 shortcut for this and instantiate the component using its string name and
 [`nlp.add_pipe`](/api/language#create_pipe).

-| Name           | Description                                                                                                                        |
-| -------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
-| `vocab`        | The shared vocabulary. ~~Vocab~~                                                                                                   |
-| `model`        | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~         |
-| `name`         | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~                                |
-| _keyword-only_ |                                                                                                                                    |
-| `threshold`    | Cutoff to consider a prediction "positive", relevant for `textcat_multilabel` when calculating accuracy scores. ~~float~~          |
-| `scorer`       | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~   |
+| Name                                             | Description                                                                                                                        |
+| ------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------ |
+| `vocab`                                          | The shared vocabulary. ~~Vocab~~                                                                                                   |
+| `model`                                          | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~         |
+| `name`                                           | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~                                |
+| _keyword-only_                                   |                                                                                                                                    |
+| `threshold`                                      | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~                                     |
+| `scorer`                                         | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~   |
+| `save_activations` 4.0                           | Save activations in `Doc` when annotating. The only supported activation is `"probabilities"`. ~~Union[bool, list[str]]~~          |
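And for the text categorizer, where `"probabilities"` is the only activation, a
sketch under the same assumptions (`my_textcat_pipeline` is a placeholder; the
scores line up with `textcat.labels`):

```python
import spacy

# Placeholder model name; assumes a trained "textcat" component.
nlp = spacy.load("my_textcat_pipeline")
textcat = nlp.get_pipe("textcat")
textcat.save_activations = True

doc = nlp("This movie was surprisingly good.")
# One score per label; doc.cats exposes the same scores keyed by label.
probs = doc.activations["textcat"]["probabilities"]
for label, score in zip(textcat.labels, probs):
    print(label, float(score))
```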

 ## TextCategorizer.\_\_call\_\_ {id="call",tag="method"}