diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index 1a29735e8e8..1f16b44cfed 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -4,7 +4,7 @@
import numpy as np
import srsly
-from thinc.api import Config, Model, NumpyOps, SequenceCategoricalCrossentropy
+from thinc.api import Config, Model, SequenceCategoricalCrossentropy
from thinc.types import ArrayXd, Floats2d, Ints1d
from .. import util
@@ -18,6 +18,10 @@
from .lemmatizer import lemmatizer_score
from .trainable_pipe import TrainablePipe
+# The cutoff value of *top_k* above which an alternative method is used to process guesses.
+TOP_K_GUARDRAIL = 20
+
+
ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
@@ -50,6 +54,7 @@
"top_k": 1,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
"save_activations": False,
+ "save_activations": False,
},
default_score_weights={"lemma_acc": 1.0},
)
@@ -63,6 +68,7 @@ def make_edit_tree_lemmatizer(
top_k: int,
scorer: Optional[Callable],
save_activations: bool,
+ save_activations: bool,
):
"""Construct an EditTreeLemmatizer component."""
return EditTreeLemmatizer(
@@ -75,6 +81,7 @@ def make_edit_tree_lemmatizer(
top_k=top_k,
scorer=scorer,
save_activations=save_activations,
+ save_activations=save_activations,
)
@@ -95,6 +102,7 @@ def __init__(
top_k: int = 1,
scorer: Optional[Callable] = lemmatizer_score,
save_activations: bool = False,
+ save_activations: bool = False,
):
"""
Construct an edit tree lemmatizer.
@@ -107,6 +115,7 @@ def __init__(
overwrite (bool): overwrite existing lemma annotations.
top_k (int): try to apply at most the k most probable edit trees.
save_activations (bool): save model activations in Doc when annotating.
+ save_activations (bool): save model activations in Doc when annotating.
"""
self.vocab = vocab
self.model = model
@@ -122,6 +131,7 @@ def __init__(
self.cfg: Dict[str, Any] = {"labels": []}
self.scorer = scorer
self.save_activations = save_activations
+ self.save_activations = save_activations
def get_loss(
self, examples: Iterable[Example], scores: List[Floats2d]
@@ -150,25 +160,6 @@ def get_loss(
return float(loss), d_scores
- def get_teacher_student_loss(
- self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
- ) -> Tuple[float, List[Floats2d]]:
- """Calculate the loss and its gradient for a batch of student
- scores, relative to teacher scores.
-
- teacher_scores: Scores representing the teacher model's predictions.
- student_scores: Scores representing the student model's predictions.
-
- RETURNS (Tuple[float, float]): The loss and the gradient.
-
- DOCS: https://spacy.io/api/edittreelemmatizer#get_teacher_student_loss
- """
- loss_func = SequenceCategoricalCrossentropy(normalize=False)
- d_scores, loss = loss_func(student_scores, teacher_scores)
- if self.model.ops.xp.isnan(loss):
- raise ValueError(Errors.E910.format(name=self.name))
- return float(loss), d_scores
-
def predict(self, docs: Iterable[Doc]) -> ActivationsT:
n_docs = len(list(docs))
if not any(len(doc) for doc in docs):
@@ -180,13 +171,21 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
scores: List[Floats2d] = [
self.model.ops.alloc((0, n_labels), dtype="i") for doc in docs
]
+ guesses: List[Ints1d] = [
+ self.model.ops.alloc((0,), dtype="i") for doc in docs
+ ]
+ scores: List[Floats2d] = [
+ self.model.ops.alloc((0, n_labels), dtype="i") for doc in docs
+ ]
assert len(guesses) == n_docs
return {"probabilities": scores, "tree_ids": guesses}
+ return {"probabilities": scores, "tree_ids": guesses}
scores = self.model.predict(docs)
assert len(scores) == n_docs
guesses = scores2guesses(docs, scores)
assert len(guesses) == n_docs
return {"probabilities": scores, "tree_ids": guesses}
+ return {"probabilities": scores, "tree_ids": guesses}
def _scores2guesses_top_k_equals_1(self, docs, scores):
guesses = []
@@ -246,9 +245,15 @@ def _scores2guesses_top_k_guardrail(self, docs, scores):
return guesses
+ def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
+ batch_tree_ids = activations["tree_ids"]
def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
batch_tree_ids = activations["tree_ids"]
for i, doc in enumerate(docs):
+ if self.save_activations:
+ doc.activations[self.name] = {}
+ for act_name, acts in activations.items():
+ doc.activations[self.name][act_name] = acts[i]
if self.save_activations:
doc.activations[self.name] = {}
for act_name, acts in activations.items():
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 629a5f193aa..2716d3821e2 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -1,3 +1,10 @@
+from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any
+from typing import cast
+from numpy import dtype
+from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
+from pathlib import Path
+from itertools import islice
+import srsly
import random
import warnings
from itertools import islice
@@ -21,10 +28,14 @@
from .pipe import deserialize_config
from .trainable_pipe import TrainablePipe
+
ActivationsT = Dict[str, Union[List[Ragged], List[str]]]
KNOWLEDGE_BASE_IDS = "kb_ids"
+# See #9050
+BACKWARD_OVERWRITE = True
+
default_model_config = """
[model]
@architectures = "spacy.EntityLinker.v2"
@@ -61,6 +72,7 @@
"candidates_batch_size": 1,
"threshold": None,
"save_activations": False,
+ "save_activations": False,
},
default_score_weights={
"nel_micro_f": 1.0,
@@ -89,6 +101,7 @@ def make_entity_linker(
candidates_batch_size: int,
threshold: Optional[float] = None,
save_activations: bool,
+ save_activations: bool,
):
"""Construct an EntityLinker component.
@@ -113,6 +126,7 @@ def make_entity_linker(
threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold,
prediction is discarded. If None, predictions are not filtered by any threshold.
save_activations (bool): save model activations in Doc when annotating.
+ save_activations (bool): save model activations in Doc when annotating.
"""
if not model.attrs.get("include_span_maker", False):
raise ValueError(Errors.E4005)
@@ -135,6 +149,7 @@ def make_entity_linker(
candidates_batch_size=candidates_batch_size,
threshold=threshold,
save_activations=save_activations,
+ save_activations=save_activations,
)
@@ -176,6 +191,7 @@ def __init__(
candidates_batch_size: int,
threshold: Optional[float] = None,
save_activations: bool = False,
+ save_activations: bool = False,
) -> None:
"""Initialize an entity linker.
@@ -230,6 +246,7 @@ def __init__(
self.candidates_batch_size = candidates_batch_size
self.threshold = threshold
self.save_activations = save_activations
+ self.save_activations = save_activations
if candidates_batch_size < 1:
raise ValueError(Errors.E1044)
@@ -437,6 +454,7 @@ def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
loss = loss / len(entity_encodings)
return float(loss), out
+ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
def predict(self, docs: Iterable[Doc]) -> ActivationsT:
"""Apply the pipeline's model to a batch of docs, without modifying them.
Returns the KB IDs for each entity in each doc, including NIL if there is
@@ -454,38 +472,48 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
xp = ops.xp
docs_ents: List[Ragged] = []
docs_scores: List[Ragged] = []
+ ops = self.model.ops
+ xp = ops.xp
+ docs_ents: List[Ragged] = []
+ docs_scores: List[Ragged] = []
if not docs:
- return {
- KNOWLEDGE_BASE_IDS: final_kb_ids,
- "ents": docs_ents,
- "scores": docs_scores,
- }
+ return {KNOWLEDGE_BASE_IDS: final_kb_ids, "ents": docs_ents, "scores": docs_scores}
if isinstance(docs, Doc):
docs = [docs]
+ for doc in docs:
+ doc_ents: List[Ints1d] = []
+ doc_scores: List[Floats1d] = []
for doc in docs:
doc_ents: List[Ints1d] = []
doc_scores: List[Floats1d] = []
if len(doc) == 0:
+ docs_scores.append(Ragged(ops.alloc1f(0), ops.alloc1i(0)))
+ docs_ents.append(Ragged(xp.zeros(0, dtype="uint64"), ops.alloc1i(0)))
docs_scores.append(Ragged(ops.alloc1f(0), ops.alloc1i(0)))
docs_ents.append(Ragged(xp.zeros(0, dtype="uint64"), ops.alloc1i(0)))
continue
sentences = [s for s in doc.sents]
- # Loop over entities in batches.
- for ent_idx in range(0, len(doc.ents), self.candidates_batch_size):
- ent_batch = doc.ents[ent_idx : ent_idx + self.candidates_batch_size]
-
- # Look up candidate entities.
- valid_ent_idx = [
- idx
- for idx in range(len(ent_batch))
- if ent_batch[idx].label_ not in self.labels_discard
- ]
-
- batch_candidates = list(
- self.get_candidates_batch(
- self.kb,
- SpanGroup(doc, spans=[ent_batch[idx] for idx in valid_ent_idx]),
+ if self.incl_context:
+ # get n_neighbour sentences, clipped to the length of the document
+ start_sentence = max(0, sent_index - self.n_sents)
+ end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
+ start_token = sentences[start_sentence].start
+ end_token = sentences[end_sentence].end
+ sent_doc = doc[start_token:end_token].as_doc()
+ # currently, the context is the same for each entity in a sentence (should be refined)
+ sentence_encoding = self.model.predict([sent_doc])[0]
+ sentence_encoding_t = sentence_encoding.T
+ sentence_norm = xp.linalg.norm(sentence_encoding_t)
+ entity_count += 1
+ if ent.label_ in self.labels_discard:
+ # ignoring this entity - setting to NIL
+ final_kb_ids.append(self.NIL)
+ self._add_activations(
+ doc_scores=doc_scores,
+ doc_ents=doc_ents,
+ scores=[0.0],
+ ents=[0],
)
else:
candidates = list(self.get_candidates(self.kb, ent))
@@ -519,51 +547,17 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
entity_encodings = xp.asarray(
[c.entity_vector for c in candidates]
)
- elif len(candidates) == 1 and self.threshold is None:
- # shortcut for efficiency reasons: take the 1 candidate
- final_kb_ids.append(candidates[0].entity_id_)
- self._add_activations(
- doc_scores=doc_scores,
- doc_ents=doc_ents,
- scores=[1.0],
- ents=[candidates[0].entity_id],
- )
- else:
- random.shuffle(candidates)
- # set all prior probabilities to 0 if incl_prior=False
- if self.incl_prior and self.kb.supports_prior_probs:
- prior_probs = xp.asarray([c.prior_prob for c in candidates]) # type: ignore
- else:
- prior_probs = xp.asarray([0.0 for _ in candidates])
- scores = prior_probs
- # add in similarity from the context
- if self.incl_context:
- entity_encodings = xp.asarray(
- [c.entity_vector for c in candidates]
- )
- entity_norm = xp.linalg.norm(entity_encodings, axis=1)
- if len(entity_encodings) != len(prior_probs):
- raise RuntimeError(
- Errors.E147.format(
- method="predict",
- msg="vectors not of equal length",
- )
+ entity_norm = xp.linalg.norm(entity_encodings, axis=1)
+ if len(entity_encodings) != len(prior_probs):
+ raise RuntimeError(
+ Errors.E147.format(
+ method="predict",
+ msg="vectors not of equal length",
)
)
- if sims.shape != prior_probs.shape:
- raise ValueError(Errors.E161)
- scores = prior_probs + sims - (prior_probs * sims)
- final_kb_ids.append(
- candidates[scores.argmax().item()].entity_id_
- if self.threshold is None
- or scores.max() >= self.threshold
- else EntityLinker.NIL
- )
- self._add_activations(
- doc_scores=doc_scores,
- doc_ents=doc_ents,
- scores=scores,
- ents=[c.entity_id for c in candidates],
+ # cosine similarity
+ sims = xp.dot(entity_encodings, sentence_encoding_t) / (
+ sentence_norm * entity_norm
)
if sims.shape != prior_probs.shape:
raise ValueError(Errors.E161)
@@ -590,27 +584,35 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
method="predict", msg="result variables not of equal length"
)
raise RuntimeError(err)
- return {
- KNOWLEDGE_BASE_IDS: final_kb_ids,
- "ents": docs_ents,
- "scores": docs_scores,
- }
+ return {KNOWLEDGE_BASE_IDS: final_kb_ids, "ents": docs_ents, "scores": docs_scores}
+ def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
"""Modify a batch of documents, using pre-computed scores.
docs (Iterable[Doc]): The documents to modify.
activations (ActivationsT): The activations used for setting annotations, produced
by EntityLinker.predict.
+ activations (ActivationsT): The activations used for setting annotations, produced
+ by EntityLinker.predict.
DOCS: https://spacy.io/api/entitylinker#set_annotations
"""
kb_ids = cast(List[str], activations[KNOWLEDGE_BASE_IDS])
+ kb_ids = cast(List[str], activations[KNOWLEDGE_BASE_IDS])
count_ents = len([ent for doc in docs for ent in doc.ents])
if count_ents != len(kb_ids):
raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
i = 0
overwrite = self.cfg["overwrite"]
+ for j, doc in enumerate(docs):
+ if self.save_activations:
+ doc.activations[self.name] = {}
+ for act_name, acts in activations.items():
+ if act_name != KNOWLEDGE_BASE_IDS:
+ # We only copy activations that are Ragged.
+ doc.activations[self.name][act_name] = cast(Ragged, acts[j])
+
for j, doc in enumerate(docs):
if self.save_activations:
doc.activations[self.name] = {}
@@ -746,3 +748,32 @@ def _add_activations(
ops = self.model.ops
doc_scores.append(ops.asarray1f(scores))
doc_ents.append(ops.asarray1i(ents, dtype="uint64"))
+
+ def _add_doc_activations(
+ self,
+ *,
+ docs_scores: List[Ragged],
+ docs_ents: List[Ragged],
+ doc_scores: List[Floats1d],
+ doc_ents: List[Ints1d],
+ ):
+ if not self.save_activations:
+ return
+ ops = self.model.ops
+ lengths = ops.asarray1i([s.shape[0] for s in doc_scores])
+ docs_scores.append(Ragged(ops.flatten(doc_scores), lengths))
+ docs_ents.append(Ragged(ops.flatten(doc_ents), lengths))
+
+ def _add_activations(
+ self,
+ *,
+ doc_scores: List[Floats1d],
+ doc_ents: List[Ints1d],
+ scores: Sequence[float],
+ ents: Sequence[int],
+ ):
+ if not self.save_activations:
+ return
+ ops = self.model.ops
+ doc_scores.append(ops.asarray1f(scores))
+ doc_ents.append(ops.asarray1i(ents, dtype="uint64"))
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 43e36b36844..c26be7912a9 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -1,4 +1,8 @@
# cython: infer_types=True, profile=True, binding=True
+from typing import Callable, Dict, Iterable, List, Optional, Union
+import srsly
+from thinc.api import SequenceCategoricalCrossentropy, Model, Config
+from thinc.types import Floats2d, Ints1d
from itertools import islice
from typing import Callable, Dict, Iterable, Optional, Union
@@ -8,6 +12,12 @@ from ..morphology cimport Morphology
from ..tokens.doc cimport Doc
from ..vocab cimport Vocab
+from ..parts_of_speech import IDS as POS_IDS
+from ..symbols import POS
+from ..language import Language
+from ..errors import Errors
+from .pipe import deserialize_config
+from .tagger import ActivationsT, Tagger
from .. import util
from ..errors import Errors
from ..language import Language
@@ -58,6 +68,13 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
"scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
"save_activations": False,
},
+ default_config={
+ "model": DEFAULT_MORPH_MODEL,
+ "overwrite": True,
+ "extend": False,
+ "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
+ "save_activations": False,
+ },
default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
)
def make_morphologizer(
@@ -69,9 +86,12 @@ def make_morphologizer(
label_smoothing: float,
scorer: Optional[Callable],
save_activations: bool,
+ save_activations: bool,
):
return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer,
save_activations=save_activations)
+ return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer,
+ save_activations=save_activations)
def morphologizer_score(examples, **kwargs):
@@ -107,6 +127,7 @@ class Morphologizer(Tagger):
extend: bool = False,
scorer: Optional[Callable] = morphologizer_score,
save_activations: bool = False,
+ save_activations: bool = False,
):
"""Initialize a morphologizer.
@@ -120,6 +141,7 @@ class Morphologizer(Tagger):
Scorer.score_token_attr for the attributes "pos" and "morph" and
Scorer.score_token_attr_per_feat for the attribute "morph".
save_activations (bool): save model activations in Doc when annotating.
+ save_activations (bool): save model activations in Doc when annotating.
DOCS: https://spacy.io/api/morphologizer#init
"""
@@ -141,6 +163,7 @@ class Morphologizer(Tagger):
self.cfg = dict(sorted(cfg.items()))
self.scorer = scorer
self.save_activations = save_activations
+ self.save_activations = save_activations
@property
def labels(self):
@@ -234,15 +257,18 @@ class Morphologizer(Tagger):
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(X=doc_sample, Y=label_sample)
+ def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
"""Modify a batch of documents, using pre-computed scores.
docs (Iterable[Doc]): The documents to modify.
activations (ActivationsT): The activations used for setting annotations, produced by Morphologizer.predict.
+ activations (ActivationsT): The activations used for setting annotations, produced by Morphologizer.predict.
DOCS: https://spacy.io/api/morphologizer#set_annotations
"""
batch_tag_ids = activations["label_ids"]
+ batch_tag_ids = activations["label_ids"]
if isinstance(docs, Doc):
docs = [docs]
cdef Doc doc
@@ -253,6 +279,10 @@ class Morphologizer(Tagger):
# to allocate a compatible container out of the iterable.
labels = tuple(self.labels)
for i, doc in enumerate(docs):
+ if self.save_activations:
+ doc.activations[self.name] = {}
+ for act_name, acts in activations.items():
+ doc.activations[self.name][act_name] = acts[i]
if self.save_activations:
doc.activations[self.name] = {}
for act_name, acts in activations.items():
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index 51670dcf8cf..dd56b4a62e6 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -1,12 +1,16 @@
# cython: infer_types=True, profile=True, binding=True
+from typing import Dict, Iterable, Optional, Callable, List, Union
from itertools import islice
from typing import Callable, Dict, Iterable, List, Optional, Union
-from thinc.api import Config, Model, SequenceCategoricalCrossentropy
+import srsly
+from thinc.api import Model, SequenceCategoricalCrossentropy, Config
+from thinc.types import Floats2d, Ints1d
from ..tokens.doc cimport Doc
-from .. import util
+from .tagger import ActivationsT, Tagger
+from ..language import Language
from ..errors import Errors
from ..language import Language
from ..scorer import Scorer
@@ -40,6 +44,12 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"]
"scorer": {"@scorers": "spacy.senter_scorer.v1"},
"save_activations": False,
},
+ default_config={
+ "model": DEFAULT_SENTER_MODEL,
+ "overwrite": False,
+ "scorer": {"@scorers": "spacy.senter_scorer.v1"},
+ "save_activations": False,
+ },
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
)
def make_senter(nlp: Language,
@@ -49,6 +59,13 @@ def make_senter(nlp: Language,
scorer: Optional[Callable],
save_activations: bool):
return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, save_activations=save_activations)
+def make_senter(nlp: Language,
+ name: str,
+ model: Model,
+ overwrite: bool,
+ scorer: Optional[Callable],
+ save_activations: bool):
+ return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, save_activations=save_activations)
def senter_score(examples, **kwargs):
@@ -79,6 +96,7 @@ class SentenceRecognizer(Tagger):
overwrite=False,
scorer=senter_score,
save_activations: bool = False,
+ save_activations: bool = False,
):
"""Initialize a sentence recognizer.
@@ -90,6 +108,7 @@ class SentenceRecognizer(Tagger):
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_spans for the attribute "sents".
save_activations (bool): save model activations in Doc when annotating.
+ save_activations (bool): save model activations in Doc when annotating.
DOCS: https://spacy.io/api/sentencerecognizer#init
"""
@@ -100,6 +119,7 @@ class SentenceRecognizer(Tagger):
self.cfg = {"overwrite": overwrite}
self.scorer = scorer
self.save_activations = save_activations
+ self.save_activations = save_activations
@property
def labels(self):
@@ -117,20 +137,27 @@ class SentenceRecognizer(Tagger):
def label_data(self):
return None
+ def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
"""Modify a batch of documents, using pre-computed scores.
docs (Iterable[Doc]): The documents to modify.
activations (ActivationsT): The activations used for setting annotations, produced by SentenceRecognizer.predict.
+ activations (ActivationsT): The activations used for setting annotations, produced by SentenceRecognizer.predict.
DOCS: https://spacy.io/api/sentencerecognizer#set_annotations
"""
batch_tag_ids = activations["label_ids"]
+ batch_tag_ids = activations["label_ids"]
if isinstance(docs, Doc):
docs = [docs]
cdef Doc doc
cdef bint overwrite = self.cfg["overwrite"]
for i, doc in enumerate(docs):
+ if self.save_activations:
+ doc.activations[self.name] = {}
+ for act_name, acts in activations.items():
+ doc.activations[self.name][act_name] = acts[i]
if self.save_activations:
doc.activations[self.name] = {}
for act_name, acts in activations.items():
diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py
index 72fd78f461e..d800a4d484b 100644
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@@ -1,18 +1,8 @@
-from dataclasses import dataclass
-from functools import partial
-from typing import (
- Any,
- Callable,
- Dict,
- Iterable,
- List,
- Optional,
- Protocol,
- Tuple,
- Union,
- cast,
- runtime_checkable,
-)
+from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast
+from typing import Union
+from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
+from thinc.api import Optimizer
+from thinc.types import Ragged, Ints2d, Floats2d, Ints1d
import numpy
from thinc.api import Config, Model, Ops, Optimizer, get_current_ops, set_dropout_rate
@@ -36,6 +26,9 @@
ActivationsT = Dict[str, Union[Floats2d, Ragged]]
+ActivationsT = Dict[str, Union[Floats2d, Ragged]]
+
+
spancat_default_config = """
[model]
@architectures = "spacy.SpanCategorizer.v1"
@@ -191,6 +184,7 @@ def build_preset_spans_suggester(spans_key: str) -> Suggester:
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
"scorer": {"@scorers": "spacy.spancat_scorer.v1"},
"save_activations": False,
+ "save_activations": False,
},
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
)
@@ -204,6 +198,7 @@ def make_spancat(
threshold: float,
max_positive: Optional[int],
save_activations: bool,
+ save_activations: bool,
) -> "SpanCategorizer":
"""Create a SpanCategorizer component and configure it for multi-label
classification to be able to assign multiple labels for each span.
@@ -232,6 +227,7 @@ def make_spancat(
max_positive (Optional[int]): Maximum number of labels to consider positive
per span. Defaults to None, indicating no limit.
save_activations (bool): save model activations in Doc when annotating.
+ save_activations (bool): save model activations in Doc when annotating.
"""
return SpanCategorizer(
nlp.vocab,
@@ -311,6 +307,7 @@ def make_spancat_singlelabel(
threshold=None,
scorer=scorer,
save_activations=save_activations,
+ save_activations=save_activations,
)
@@ -374,6 +371,7 @@ def __init__(
threshold: Optional[float] = 0.5,
scorer: Optional[Callable] = spancat_score,
save_activations: bool = False,
+ save_activations: bool = False,
) -> None:
"""Initialize the multi-label or multi-class span categorizer.
@@ -424,6 +422,7 @@ def __init__(
self.name = name
self.scorer = scorer
self.save_activations = save_activations
+ self.save_activations = save_activations
@property
def key(self) -> str:
@@ -481,6 +480,7 @@ def label_data(self) -> List[str]:
"""
return list(self.labels)
+ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
def predict(self, docs: Iterable[Doc]) -> ActivationsT:
"""Apply the pipeline's model to a batch of docs, without modifying them.
@@ -492,6 +492,8 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
indices = self.suggester(docs, ops=self.model.ops)
scores = self.model.predict((docs, indices)) # type: ignore
return {"indices": indices, "scores": scores}
+ scores = self.model.predict((docs, indices)) # type: ignore
+ return {"indices": indices, "scores": scores}
def set_candidates(
self, docs: Iterable[Doc], *, candidates_key: str = "candidates"
@@ -511,11 +513,13 @@ def set_candidates(
for index in candidates.dataXd:
doc.spans[candidates_key].append(doc[index[0] : index[1]])
+ def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
"""Modify a batch of Doc objects, using pre-computed scores.
docs (Iterable[Doc]): The documents to modify.
activations: ActivationsT: The activations, produced by SpanCategorizer.predict.
+ activations: ActivationsT: The activations, produced by SpanCategorizer.predict.
DOCS: https://spacy.io/api/spancategorizer#set_annotations
"""
@@ -524,9 +528,10 @@ def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> Non
indices = activations["indices"]
assert isinstance(indices, Ragged)
scores = cast(Floats2d, activations["scores"])
+
offset = 0
for i, doc in enumerate(docs):
- indices_i = cast(Ints2d, indices[i].dataXd)
+ indices_i = indices[i].dataXd
if self.save_activations:
doc.activations[self.name] = {}
doc.activations[self.name]["indices"] = indices_i
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 21c7b3ab0a3..95016072ef3 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -1,4 +1,9 @@
# cython: infer_types=True, profile=True, binding=True
+from typing import Callable, Dict, Iterable, List, Optional, Union
+import numpy
+import srsly
+from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config
+from thinc.types import Floats2d, Ints1d
import warnings
from itertools import islice
from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
@@ -22,8 +27,12 @@ from ..util import registry
from .pipe import deserialize_config
from .trainable_pipe import TrainablePipe
+
ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
+# See #9050
+BACKWARD_OVERWRITE = False
+
default_model_config = """
[model]
@architectures = "spacy.Tagger.v2"
@@ -51,6 +60,13 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
"neg_prefix": "!",
"save_activations": False,
},
+ default_config={
+ "model": DEFAULT_TAGGER_MODEL,
+ "overwrite": False,
+ "scorer": {"@scorers": "spacy.tagger_scorer.v1"},
+ "neg_prefix": "!",
+ "save_activations": False,
+ },
default_score_weights={"tag_acc": 1.0},
)
def make_tagger(
@@ -61,6 +77,7 @@ def make_tagger(
scorer: Optional[Callable],
neg_prefix: str,
save_activations: bool,
+ save_activations: bool,
):
"""Construct a part-of-speech tagger component.
@@ -71,6 +88,8 @@ def make_tagger(
"""
return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix,
save_activations=save_activations)
+ return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix,
+ save_activations=save_activations)
def tagger_score(examples, **kwargs):
@@ -97,6 +116,7 @@ class Tagger(TrainablePipe):
scorer=tagger_score,
neg_prefix="!",
save_activations: bool = False,
+ save_activations: bool = False,
):
"""Initialize a part-of-speech tagger.
@@ -108,6 +128,7 @@ class Tagger(TrainablePipe):
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_token_attr for the attribute "tag".
save_activations (bool): save model activations in Doc when annotating.
+ save_activations (bool): save model activations in Doc when annotating.
DOCS: https://spacy.io/api/tagger#init
"""
@@ -119,6 +140,7 @@ class Tagger(TrainablePipe):
self.cfg = dict(sorted(cfg.items()))
self.scorer = scorer
self.save_activations = save_activations
+ self.save_activations = save_activations
@property
def labels(self):
@@ -137,6 +159,7 @@ class Tagger(TrainablePipe):
"""Data about the labels currently added to the component."""
return tuple(self.cfg["labels"])
+ def predict(self, docs) -> ActivationsT:
def predict(self, docs) -> ActivationsT:
"""Apply the pipeline's model to a batch of docs, without modifying them.
@@ -151,11 +174,13 @@ class Tagger(TrainablePipe):
guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs]
assert len(guesses) == len(docs)
return {"probabilities": guesses, "label_ids": guesses}
+ return {"probabilities": guesses, "label_ids": guesses}
scores = self.model.predict(docs)
assert len(scores) == len(docs), (len(scores), len(docs))
guesses = self._scores2guesses(scores)
assert len(guesses) == len(docs)
return {"probabilities": scores, "label_ids": guesses}
+ return {"probabilities": scores, "label_ids": guesses}
def _scores2guesses(self, scores):
guesses = []
@@ -166,21 +191,28 @@ class Tagger(TrainablePipe):
guesses.append(doc_guesses)
return guesses
+ def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
"""Modify a batch of documents, using pre-computed scores.
docs (Iterable[Doc]): The documents to modify.
activations (ActivationsT): The activations used for setting annotations, produced by Tagger.predict.
+ activations (ActivationsT): The activations used for setting annotations, produced by Tagger.predict.
DOCS: https://spacy.io/api/tagger#set_annotations
"""
batch_tag_ids = activations["label_ids"]
+ batch_tag_ids = activations["label_ids"]
if isinstance(docs, Doc):
docs = [docs]
cdef Doc doc
cdef bint overwrite = self.cfg["overwrite"]
labels = self.labels
for i, doc in enumerate(docs):
+ if self.save_activations:
+ doc.activations[self.name] = {}
+ for act_name, acts in activations.items():
+ doc.activations[self.name][act_name] = acts[i]
if self.save_activations:
doc.activations[self.name] = {}
for act_name, acts in activations.items():
@@ -270,7 +302,7 @@ class Tagger(TrainablePipe):
student_scores: Scores representing the student model's predictions.
RETURNS (Tuple[float, float]): The loss and the gradient.
-
+
DOCS: https://spacy.io/api/tagger#get_teacher_student_loss
"""
loss_func = SequenceCategoricalCrossentropy(normalize=False)
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index 13841dd7bbb..79a98b9bc5f 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -1,3 +1,7 @@
+from typing import Iterable, Tuple, Optional, Dict, List, Callable, Any, Union
+from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config
+from thinc.types import Floats2d
+import numpy
from itertools import islice
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py
index 309b9a84443..d38beb441da 100644
--- a/spacy/pipeline/textcat_multilabel.py
+++ b/spacy/pipeline/textcat_multilabel.py
@@ -1,3 +1,7 @@
+from typing import Iterable, Optional, Dict, List, Callable, Any, Union
+from thinc.types import Floats2d
+from thinc.api import Model, Config
+
from itertools import islice
from typing import Any, Callable, Dict, Iterable, List, Optional, Union
@@ -80,6 +84,8 @@
"model": DEFAULT_MULTI_TEXTCAT_MODEL,
"scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"},
"save_activations": False,
+ "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"},
+ "save_activations": False,
},
default_score_weights={
"cats_score": 1.0,
@@ -101,6 +107,9 @@ def make_multilabel_textcat(
threshold: float,
scorer: Optional[Callable],
save_activations: bool,
+) -> "TextCategorizer":
+ """Create a TextCategorizer component. The text categorizer predicts categories
+ save_activations: bool,
) -> "TextCategorizer":
"""Create a TextCategorizer component. The text categorizer predicts categories
over a whole document. It can learn one or more labels, and the labels are considered
@@ -119,6 +128,12 @@ def make_multilabel_textcat(
threshold=threshold,
scorer=scorer,
save_activations=save_activations,
+ nlp.vocab,
+ model,
+ name,
+ threshold=threshold,
+ scorer=scorer,
+ save_activations=save_activations,
)
@@ -151,6 +166,7 @@ def __init__(
threshold: float,
scorer: Optional[Callable] = textcat_multilabel_score,
save_activations: bool = False,
+ save_activations: bool = False,
) -> None:
"""Initialize a text categorizer for multi-label classification.
@@ -159,7 +175,6 @@ def __init__(
name (str): The component instance name, used to add entries to the
losses during training.
threshold (float): Cutoff to consider a prediction "positive".
- scorer (Optional[Callable]): The scoring method.
save_activations (bool): save model activations in Doc when annotating.
DOCS: https://spacy.io/api/textcategorizer#init
@@ -172,6 +187,7 @@ def __init__(
self.cfg = dict(cfg)
self.scorer = scorer
self.save_activations = save_activations
+ self.save_activations = save_activations
@property
def support_missing_values(self):
diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx
index 065a6c20d62..b9c297990f9 100644
--- a/spacy/pipeline/trainable_pipe.pyx
+++ b/spacy/pipeline/trainable_pipe.pyx
@@ -2,10 +2,14 @@
from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple
import srsly
-from thinc.api import Model, Optimizer, set_dropout_rate
+from thinc.api import set_dropout_rate, Model, Optimizer
+import warnings
from ..tokens.doc cimport Doc
+from ..training import validate_examples
+from ..errors import Errors, Warnings
+from .pipe import Pipe, deserialize_config
from .. import util
from ..errors import Errors
from ..language import Language
diff --git a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
index 7465c844492..e423965bedc 100644
--- a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
+++ b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
@@ -1,3 +1,4 @@
+from typing import cast
import pickle
from typing import cast
@@ -10,6 +11,7 @@
from spacy.language import Language
from spacy.pipeline._edit_tree_internals.edit_trees import EditTrees
from spacy.pipeline.trainable_pipe import TrainablePipe
+from spacy.training import Example
from spacy.strings import StringStore
from spacy.training import Example
from spacy.util import make_tempdir
@@ -403,3 +405,26 @@ def test_save_activations():
]
assert doc.activations["trainable_lemmatizer"]["probabilities"].shape == (5, nO)
assert doc.activations["trainable_lemmatizer"]["tree_ids"].shape == (5,)
+
+
+def test_save_activations():
+ nlp = English()
+ lemmatizer = cast(TrainablePipe, nlp.add_pipe("trainable_lemmatizer"))
+ lemmatizer.min_tree_freq = 1
+ train_examples = []
+ for t in TRAIN_DATA:
+ train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+ nlp.initialize(get_examples=lambda: train_examples)
+ nO = lemmatizer.model.get_dim("nO")
+
+ doc = nlp("This is a test.")
+ assert "trainable_lemmatizer" not in doc.activations
+
+ lemmatizer.save_activations = True
+ doc = nlp("This is a test.")
+ assert list(doc.activations["trainable_lemmatizer"].keys()) == [
+ "probabilities",
+ "tree_ids",
+ ]
+ assert doc.activations["trainable_lemmatizer"]["probabilities"].shape == (5, nO)
+ assert doc.activations["trainable_lemmatizer"]["tree_ids"].shape == (5,)
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index e44fef2ad25..804332a9ae8 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -1,8 +1,9 @@
-from typing import Any, Callable, Dict, Iterable, cast
+from typing import Callable, Iterable, Dict, Any, cast
import pytest
from numpy.testing import assert_equal
from thinc.types import Ragged
+from thinc.types import Ragged
from spacy import Language, registry, util
from spacy.attrs import ENT_KB_ID
@@ -10,8 +11,8 @@
from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase
from spacy.lang.en import English
from spacy.ml import load_kb
-from spacy.ml.models.entity_linker import build_span_maker, get_candidates
from spacy.pipeline import EntityLinker, TrainablePipe
+from spacy.pipeline.legacy import EntityLinker_v1
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
from spacy.scorer import Scorer
from spacy.tests.util import make_tempdir
@@ -1292,6 +1293,7 @@ def create_kb(vocab):
assert doc.ents[0].kb_id_ == entity_id if meet_threshold else EntityLinker.NIL
+def test_save_activations():
def test_save_activations():
nlp = English()
vector_length = 3
@@ -1307,7 +1309,7 @@ def create_kb(vocab):
# create artificial KB - assign the same prior weight to the two Russ Cochrans
# Q2146908 (Russ Cochran): American golfer
# Q7381115 (Russ Cochran): publisher
- mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
+ mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
mykb.add_alias(
diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py
index 542d14d1516..bf2eea8a94e 100644
--- a/spacy/tests/pipeline/test_morphologizer.py
+++ b/spacy/tests/pipeline/test_morphologizer.py
@@ -1,5 +1,4 @@
from typing import cast
-
import pytest
from numpy.testing import assert_almost_equal, assert_equal
from thinc.api import get_current_ops
@@ -10,7 +9,7 @@
from spacy.language import Language
from spacy.morphology import Morphology
from spacy.pipeline import TrainablePipe
-from spacy.tests.util import make_tempdir
+from spacy.attrs import MORPH
from spacy.tokens import Doc
from spacy.training import Example
@@ -255,3 +254,25 @@ def test_save_activations():
}
assert doc.activations["morphologizer"]["probabilities"].shape == (5, 6)
assert doc.activations["morphologizer"]["label_ids"].shape == (5,)
+
+
+def test_save_activations():
+ nlp = English()
+ morphologizer = cast(TrainablePipe, nlp.add_pipe("morphologizer"))
+ train_examples = []
+ for inst in TRAIN_DATA:
+ train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1]))
+ nlp.initialize(get_examples=lambda: train_examples)
+
+ doc = nlp("This is a test.")
+ assert "morphologizer" not in doc.activations
+
+ morphologizer.save_activations = True
+ doc = nlp("This is a test.")
+ assert "morphologizer" in doc.activations
+ assert set(doc.activations["morphologizer"].keys()) == {
+ "label_ids",
+ "probabilities",
+ }
+ assert doc.activations["morphologizer"]["probabilities"].shape == (5, 6)
+ assert doc.activations["morphologizer"]["label_ids"].shape == (5,)
diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py
index 51f943898f1..a594c10b04c 100644
--- a/spacy/tests/pipeline/test_senter.py
+++ b/spacy/tests/pipeline/test_senter.py
@@ -1,5 +1,4 @@
from typing import cast
-
import pytest
from numpy.testing import assert_equal
@@ -8,6 +7,7 @@
from spacy.lang.en import English
from spacy.language import Language
from spacy.pipeline import TrainablePipe
+from spacy.pipeline import TrainablePipe
from spacy.tests.util import make_tempdir
from spacy.training import Example
@@ -133,3 +133,26 @@ def test_save_activations():
assert set(doc.activations["senter"].keys()) == {"label_ids", "probabilities"}
assert doc.activations["senter"]["probabilities"].shape == (5, nO)
assert doc.activations["senter"]["label_ids"].shape == (5,)
+
+
+def test_save_activations():
+ # Test if activations are correctly added to Doc when requested.
+ nlp = English()
+ senter = cast(TrainablePipe, nlp.add_pipe("senter"))
+
+ train_examples = []
+ for t in TRAIN_DATA:
+ train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+
+ nlp.initialize(get_examples=lambda: train_examples)
+ nO = senter.model.get_dim("nO")
+
+ doc = nlp("This is a test.")
+ assert "senter" not in doc.activations
+
+ senter.save_activations = True
+ doc = nlp("This is a test.")
+ assert "senter" in doc.activations
+ assert set(doc.activations["senter"].keys()) == {"label_ids", "probabilities"}
+ assert doc.activations["senter"]["probabilities"].shape == (5, nO)
+ assert doc.activations["senter"]["label_ids"].shape == (5,)
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index 05e814f0733..50cc828a038 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -1,5 +1,4 @@
from typing import cast
-
import pytest
from numpy.testing import assert_almost_equal, assert_equal
from thinc.api import compounding, get_current_ops
@@ -9,7 +8,7 @@
from spacy.lang.en import English
from spacy.language import Language
from spacy.pipeline import TrainablePipe
-from spacy.training import Example
+from thinc.api import compounding
from ..util import make_tempdir
@@ -240,52 +239,6 @@ def test_overfitting_IO():
assert doc3[0].tag_ != "N"
-def test_is_distillable():
- nlp = English()
- tagger = nlp.add_pipe("tagger")
- assert tagger.is_distillable
-
-
-def test_distill():
- teacher = English()
- teacher_tagger = teacher.add_pipe("tagger")
- train_examples = []
- for t in TRAIN_DATA:
- train_examples.append(Example.from_dict(teacher.make_doc(t[0]), t[1]))
-
- optimizer = teacher.initialize(get_examples=lambda: train_examples)
-
- for i in range(50):
- losses = {}
- teacher.update(train_examples, sgd=optimizer, losses=losses)
- assert losses["tagger"] < 0.00001
-
- student = English()
- student_tagger = student.add_pipe("tagger")
- student_tagger.min_tree_freq = 1
- student_tagger.initialize(
- get_examples=lambda: train_examples, labels=teacher_tagger.label_data
- )
-
- distill_examples = [
- Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
- ]
-
- for i in range(50):
- losses = {}
- student_tagger.distill(
- teacher_tagger, distill_examples, sgd=optimizer, losses=losses
- )
- assert losses["tagger"] < 0.00001
-
- test_text = "I like blue eggs"
- doc = student(test_text)
- assert doc[0].tag_ == "N"
- assert doc[1].tag_ == "V"
- assert doc[2].tag_ == "J"
- assert doc[3].tag_ == "N"
-
-
def test_save_activations():
# Test if activations are correctly added to Doc when requested.
nlp = English()
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 2bba40d1d13..a54bf394608 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -1,3 +1,4 @@
+from typing import cast
import random
from typing import cast
@@ -13,16 +14,12 @@
from spacy.lang.en import English
from spacy.language import Language
from spacy.pipeline import TextCategorizer, TrainablePipe
-from spacy.pipeline.textcat import (
- single_label_bow_config,
- single_label_cnn_config,
- single_label_default_config,
-)
-from spacy.pipeline.textcat_multilabel import (
- multi_label_bow_config,
- multi_label_cnn_config,
- multi_label_default_config,
-)
+from spacy.pipeline.textcat import single_label_bow_config
+from spacy.pipeline.textcat import single_label_cnn_config
+from spacy.pipeline.textcat import single_label_default_config
+from spacy.pipeline.textcat_multilabel import multi_label_bow_config
+from spacy.pipeline.textcat_multilabel import multi_label_cnn_config
+from spacy.pipeline.textcat_multilabel import multi_label_default_config
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
from spacy.scorer import Scorer
from spacy.tokens import Doc, DocBin
@@ -304,6 +301,7 @@ def test_issue9904():
examples = get_examples()
scores = textcat.predict([eg.predicted for eg in examples])["probabilities"]
+ scores = textcat.predict([eg.predicted for eg in examples])["probabilities"]
loss = textcat.get_loss(examples, scores)[0]
loss_double_bs = textcat.get_loss(examples * 2, scores.repeat(2, axis=0))[0]
@@ -962,9 +960,11 @@ def test_textcat_multi_threshold():
assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0
+def test_save_activations():
def test_save_activations():
nlp = English()
textcat = cast(TrainablePipe, nlp.add_pipe("textcat"))
+ textcat = cast(TrainablePipe, nlp.add_pipe("textcat"))
train_examples = []
for text, annotations in TRAIN_DATA_SINGLE_LABEL:
@@ -981,6 +981,34 @@ def test_save_activations():
assert doc.activations["textcat"]["probabilities"].shape == (nO,)
+def test_save_activations_multi():
+ nlp = English()
+ textcat = cast(TrainablePipe, nlp.add_pipe("textcat_multilabel"))
+
+ train_examples = []
+ for text, annotations in TRAIN_DATA_MULTI_LABEL:
+ train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+ nlp.initialize(get_examples=lambda: train_examples)
+ nO = textcat.model.get_dim("nO")
+
+ doc = nlp("This is a test.")
+ assert "textcat_multilabel" not in doc.activations
+
+ textcat.save_activations = True
+ doc = nlp("This is a test.")
+ assert list(doc.activations["textcat_multilabel"].keys()) == ["probabilities"]
+ assert doc.activations["textcat_multilabel"]["probabilities"].shape == (nO,)
+ nO = textcat.model.get_dim("nO")
+
+ doc = nlp("This is a test.")
+ assert "textcat" not in doc.activations
+
+ textcat.save_activations = True
+ doc = nlp("This is a test.")
+ assert list(doc.activations["textcat"].keys()) == ["probabilities"]
+ assert doc.activations["textcat"]["probabilities"].shape == (nO,)
+
+
def test_save_activations_multi():
nlp = English()
textcat = cast(TrainablePipe, nlp.add_pipe("textcat_multilabel"))
diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index 9fb6a72c87f..fc0404f1423 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -50,6 +50,8 @@ cdef class Doc:
cdef public dict activations
+ cdef public dict activations
+
cdef public dict user_hooks
cdef public dict user_token_hooks
cdef public dict user_span_hooks
diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index 1304a8aae8d..d83aa0e5486 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -15,7 +15,7 @@ from typing import (
import numpy as np
from cymem.cymem import Pool
-from thinc.types import Floats1d, Floats2d, Ints2d
+from thinc.types import ArrayXd, Floats1d, Floats2d, Ints2d, Ragged
from .span import Span
from .token import Token
from .span_groups import SpanGroups
@@ -37,6 +37,7 @@ class Doc:
spans: SpanGroups
max_length: int
length: int
+ sentiment: float
activations: Dict[str, Dict[str, Union[ArrayXd, Ragged]]]
cats: Dict[str, float]
user_hooks: Dict[str, Callable[..., Any]]
diff --git a/website/docs/api/doc.mdx b/website/docs/api/doc.mdx
index e92c0e833e0..842b2181a81 100644
--- a/website/docs/api/doc.mdx
+++ b/website/docs/api/doc.mdx
@@ -762,6 +762,7 @@ The L2 norm of the document's vector representation.
| `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ |
| `lang` 2.1 | Language of the document's vocabulary. ~~int~~ |
| `lang_` 2.1 | Language of the document's vocabulary. ~~str~~ |
+| `sentiment` | The document's positivity/negativity score, if available. ~~float~~ |
| `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ |
| `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ |
| `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ |
diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx
index a5dd721e37a..074d1706ac7 100644
--- a/website/docs/api/entitylinker.mdx
+++ b/website/docs/api/entitylinker.mdx
@@ -63,7 +63,7 @@ architectures and their arguments and hyperparameters.
| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~bool~~ |
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
-| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ |
+| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
| `save_activations` 4.0 | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~ |
| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
diff --git a/website/docs/api/morphologizer.mdx b/website/docs/api/morphologizer.mdx
index 61abe043e77..4660ec312fa 100644
--- a/website/docs/api/morphologizer.mdx
+++ b/website/docs/api/morphologizer.mdx
@@ -45,7 +45,7 @@ architectures and their arguments and hyperparameters.
| Setting | Description |
| ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `model` | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
-| `overwrite` 3.2 | Whether the values of existing features are overwritten. Defaults to `False`. ~~bool~~ |
+| `overwrite` 3.2 | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~ |
| `extend` 3.2 | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~ |
| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ |
| `save_activations` 4.0 | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~ |
@@ -454,8 +454,8 @@ coarse-grained POS as the feature `POS`.
> assert "Mood=Ind|POS=VERB|Tense=Past|VerbForm=Fin" in morphologizer.labels
> ```
-| Name | Description |
-| ----------- | ------------------------------------------------------ |
+| Name | Description |
+| ----------- | --------------------------------------------------------- |
| **RETURNS** | The labels added to the component. ~~Iterable[str, ...]~~ |
## Morphologizer.label_data {id="label_data",tag="property",version="3"}